

```{r Library}


library(plyr)
library(tidyverse)
library(haven)
library(janitor)
library(tidycensus)

# "Not in" operator: TRUE for each element of the left-hand side that does
# not appear in the right-hand side. Defined on top of base R's `%in%` so
# the operator does not depend on purrr being attached.
`%nin%` <- function(x, table) !(x %in% table)

```





```{r Data load}

## Previous surveys were collected and combined into a single dataset
## See appendix for full details of surveys

df1 <- read_csv("Data/Combined_Surveys5.csv")

df_collected <- df1


## Load all datasets here
## Every raw file is read once here; the per-survey chunks below work on copies.


## LNPS
lnps_89 <- read_csv("Data/lnps.csv")


## Kaiser / Pew (Stata .dta and SPSS .sav files, read with haven)
kaiser_99 <- read_dta("Data/kaiser99.dta")
pew_02 <- read_sav("Data/pew_02.sav")
pew_04 <- read_sav("Data/pew_04.sav")
pew_06 <- read_sav("Data/pew_06.sav")
pew_07 <- read_sav("Data/pew_07.sav")
pew_08 <- read_sav("Data/pew_08.sav")
pew_09 <- read_sav("Data/pew_09.sav")
pew_10 <- read_sav("Data/pew_10.sav")
pew_11 <- read_sav("Data/pew_11.sav")
pew_12 <- read_sav("Data/pew_12.sav")
pew_13 <- read_sav("Data/pew_13.sav")
# The 2014 file is the only one read with an explicit encoding.
pew_14 <- read_sav("Data/pew_14.sav", encoding = "latin1")
pew_15 <- read_sav("Data/pew_15.sav")
pew_16 <- read_sav("Data/pew_16.sav")
pew_17 <- read_sav("Data/pew_17.sav")
pew_18 <- read_sav("Data/pew_18.sav")

## LNS
# Uses the same "Data/" directory (capital D) as every other file in this
# chunk; the lower-case "data/" spelling only resolves on case-insensitive
# file systems.
lns_06 <- read_csv("Data/LNS.csv")

## CMPS
cmps_16 <- read_dta("Data/cmps_16.dta")
# NOTE(review): load() restores the saved objects into the current
# environment under their original names and returns only a character
# vector of those names. `cmps_20` therefore holds object name(s), not the
# survey data -- confirm the CMPS 2020 chunk uses the restored object (or
# get(cmps_20)) rather than `cmps_20` itself.
cmps_20 <- load("Data/cmps_20.rdata")


## CES
# Cumulative CES file covering all years.
ces_full <- readRDS("Data/cumulative_2006-2023.rds")

## Getting immstat variables
# One immigration-status CSV per survey year: 2006-2014 use the "_Immstat"
# suffix and 2016-2018 the "_MexicoImmstat" suffix (no 2015 file is read).
# The files are read in year order and stacked with plyr::rbind.fill(),
# which pads columns missing from any single file with NA.
ces_immstat <- rbind.fill(
  lapply(
    file.path(
      "Data/Cumulative CCES 2006-2018",
      c(
        paste0("CCES", 2006:2014, "_Immstat.csv"),
        paste0("CCES", 2016:2018, "_MexicoImmstat.csv")
      )
    ),
    read.csv,
    stringsAsFactors = FALSE
  )
)

# Left join: every cumulative-file row is kept, matched on year + case id.
ces_full <- merge(
  ces_full,
  ces_immstat,
  by.x = c("year", "case_id"),
  by.y = c("year", "caseid"),
  all.x = TRUE
)

## For some years the citizenship variables exist only in the single-year files

# Stata-format single-year files
ces_17 <- read_dta("Data/CES_2017.dta")
ces_19 <- read_dta("Data/CCES19_Common_OUTPUT.dta")
ces_21 <- read_dta("Data/CCES21_Common_OUTPUT.dta")
ces_23 <- read_dta("Data/CCES23_Common_OUTPUT.dta")

# CSV-format single-year files
ces_20 <- read_csv("Data/CES20_Common_OUTPUT_vv.csv")
ces_22 <- read_csv("Data/CES22_Common.csv")



```


## LNPS

```{r LNPS 1989}

# Work on a copy so the raw LNPS import (lnps_89) stays untouched.
lnps <- lnps_89

names(lnps_89)

# Citizens
# Scratch tabulation of the raw CITIZEN codes; diag_df is reused for ad hoc
# checks throughout and is overwritten each time.
diag_df <- lnps %>%
  group_by(CITIZEN) %>%
  reframe(count=n())

## 0 = nativeborn, 1 = naturalized

# citizen = 1 for CITIZEN codes 0/1 and 0 for everything else, including
# NA (%in% never returns NA).
lnps <- lnps %>%
  mutate(citizen = ifelse(CITIZEN %in% c(0,1),
                          1,
                          0))

# Latinos only
lnps <- subset(lnps, RGROUP %in% c(1,2,3,5))

# sex stays "" when GENDER is neither 1 nor 2.
lnps$sex <- ""
lnps$sex[lnps$GENDER == 1] <- "M"
lnps$sex[lnps$GENDER == 2] <- "F"

#Coding for Mexican Yes=1 No=0
lnps$Mexican <-0
lnps$Mexican[lnps$RGROUP==1|lnps$RGROUP==5] <-1 #Combination of Mexican and Cuban included
lnps$Mexican[lnps$MEXICAN == 1 | lnps$MEXICANO == 1 | lnps$MEXAMER == 1 | lnps$OTHERID == 6 | lnps$CHICANO == 1] <- 1 # Gets 6 more people who are mexican but not classified as such in the lnps groupings

#Coding for US Born
# USborn: 1 = RCOUNTRY code 1, 0 = any other code, NA for codes 97/98/99.
# A missing RCOUNTRY is also coded 0, because NA is not in the excluded set.
lnps$USborn <- NA
lnps$USborn[!(lnps$RCOUNTRY %in% c(1,97,98,99))] <- 0
lnps$USborn[lnps$RCOUNTRY==1] <- 1 #Doesn't include born in Puerto Rico (RCOUNTRY=2)- 4 Mex
#Coding for Generation (at least 1 Grandparent born outside U.S.)

# USborn_parents counts parents whose birthplace code is 1 (0, 1 or 2).
# Order matters: each statement only fills rows the earlier ones left NA,
# except the "both parents" line, which overwrites unconditionally.
# NOTE(review): birthplace code 1 is presumably "United States", as for
# RCOUNTRY -- confirm against the codebook.
lnps$USborn_parents <- NA
lnps$USborn_parents[lnps$USborn == 0 | (lnps$FATHBPLC != 1 & lnps$MOMBPLC != 1)] <- 0
lnps$USborn_parents[lnps$FATHBPLC == 1 & lnps$MOMBPLC == 1] <- 2
lnps$USborn_parents[(lnps$FATHBPLC == 1 | lnps$MOMBPLC == 1) & is.na(lnps$USborn_parents)] <- 1
lnps$USborn_parents[!(lnps$FATHBPLC %in% c(97,98,99)) & !(lnps$MOMBPLC %in% c(97,98,99)) & is.na(lnps$USborn_parents)] <- 0

# USborn_gparents: forced to 0 when the respondent or both parents are coded
# as not US-born; otherwise the number of grandparents (0-4) whose
# birthplace code is 1.
lnps$USborn_gparents <- NA
lnps$USborn_gparents[lnps$USborn == 0 | lnps$USborn_parents == 0] <- 0
# One 0/1 flag per grandparent; any birthplace value other than 1
# (including NA) leaves the flag at 0.
lnps$fpat <- 0
lnps$fpat[lnps$FPATBPLC == 1] <- 1
lnps$fmat <- 0
lnps$fmat[lnps$FMATBPLC == 1] <- 1
lnps$mmat <- 0
lnps$mmat[lnps$MMATBPLC == 1] <- 1
lnps$mpat <- 0
lnps$mpat[lnps$MFATBPLC == 1] <- 1
lnps$gparents <- lnps$fpat + lnps$fmat + lnps$mmat + lnps$mpat
# Only rows still NA receive the grandparent count.
lnps$USborn_gparents[is.na(lnps$USborn_gparents)] <- lnps$gparents[is.na(lnps$USborn_gparents)]

#Create Age Variable
# Age is computed as 90 minus BYEAR (presumably a two-digit birth year --
# confirm against the codebook).
lnps$age <- NA
lnps$age <- 90-lnps$BYEAR
lnps$age[lnps$age<=0] <- NA #Gets rid of "negative" ages for codes of DKs and refused
#summary(lnps$AGE)

## Get region variables in there.
# Region comes from a separate lookup file keyed on the case id.
# merge() defaults to an inner join, so any LNPS row whose CASEOFF has no
# match in lnpsregion.csv is dropped here.
lnps2 <- read.csv("Data/lnpsregion.csv", stringsAsFactors=FALSE)
lnps2 <- subset(lnps2, select=c(caseoff, region))
lnps <- merge(lnps, lnps2, by.x=c("CASEOFF"), by.y=c("caseoff"))
# Only the "FLORIDA" region is mapped to a state; all other rows keep "".
lnps$state <- ""
lnps$state[lnps$region == "FLORIDA"] <- "FL"

# Retired exploration code used to inspect the region listings, kept for reference.
#test <- subset(lnps, select=c(LISTING, HOUSUNIT, region, RGROUP, MXDENS, PRDENS, CBDENS))
#write.csv(test, "LNPSregion.csv", na="", row.names=FALSE)
#WEST COAST = California, Portland
#SOUTHWEST = TX, AZ, NM, CO, NV # pg 28 of codebook
#testW <- subset(test, region == "WEST COAST")
#testF <- subset(test, region == "FLORIDA")
#table(testW$LISTING)

# Scratch cross-tab of the raw party items (computed on the unfiltered import).
diag_df <- lnps_89 %>%
  group_by(PARTISAN, CLOSEPAR) %>%
  reframe(count=n())

# Party: 3-point identification with leaners folded into the parties.
# The Republican line runs second, so a row matching both conditions ends
# up "Republican". "Independent"/"Other" only fill rows that are still NA.
lnps$Party <-NA
lnps$Party[lnps$PARTISAN==1 | lnps$CLOSEPAR==2] <- "Democrat"
lnps$Party[lnps$PARTISAN==2 | lnps$CLOSEPAR==1] <- "Republican" 
lnps$Party[lnps$PARTISAN==3 & is.na(lnps$Party)] <- "Independent"
lnps$Party[lnps$PARTISAN==4 & is.na(lnps$Party)] <- "Other"
# Party7: starts from Party, then overwritten with strength / lean labels.
# Statements are applied in order, so the CLOSEPAR-based labels win over
# the STRDEMO / STREPUB ones when both are present.
lnps$Party7 <- lnps$Party
lnps$Party7[lnps$STRDEMO == 1] <- "Strong Democrat"
lnps$Party7[lnps$STREPUB == 1] <- "Strong Republican"
lnps$Party7[lnps$STRDEMO == 2] <- "Weak Democrat"
lnps$Party7[lnps$STREPUB == 2] <- "Weak Republican"
lnps$Party7[lnps$CLOSEPAR == 2] <- "Lean Democrat"
lnps$Party7[lnps$CLOSEPAR == 1] <- "Lean Republican"
lnps$Party7[lnps$CLOSEPAR == 3] <- "Independent"


# Cross-check of the recodes against the raw items.
diag_df <- lnps %>%
  group_by(PARTISAN, CLOSEPAR, Party, Party7) %>%
  reframe(count=n())


# Survey-level metadata and harmonized identifiers.
lnps$Year <- 1989
lnps$Survey <- "LNPS"
lnps$id <- lnps$CASEOFF
lnps$Weight <- lnps$SFWT  #subgroup (Mexican) specific
lnps$Weight_AllLatinos <- lnps$FWT

lnps$date <- lnps$DATE
# Birth year is BYEAR + 1900; results after 1974 are set to NA.
lnps$birthyear <- lnps$BYEAR + 1900
lnps$birthyear[lnps$birthyear > 1974] <- NA


# Education in four harmonized categories. Statements are applied in order:
# "HSOnly" can overwrite "NoHS", "CollGrad" overwrites both, and
# "SomeColl_Other" only fills rows that are still NA.
lnps$education <- NA
lnps$education[lnps$REDUC %in% c(0,1,2,3,4,5,6,7,8,9,10,11) | lnps$HISCHOOL == 2] <- "NoHS"
lnps$education[lnps$REDUC %in% c(12) | lnps$HISCHOOL == 1 | lnps$HIDEGREE %in% c(1,2)] <- "HSOnly"
lnps$education[lnps$HIDEGREE %in% c(seq(10,51,1))] <- "CollGrad"
lnps$education[(lnps$HIDEGREE %in% c(5) | lnps$REDUC %in% c(13,14,15,16,17)) & is.na(lnps$education)] <- "SomeColl_Other" # Includes Associate


## Check for full origin variables

diag_df <- lnps %>%
  group_by(RGROUP) %>%
  reframe(count=n())

## Only Mexican, PR, Cuban

# origin stays "" for any RGROUP value not listed below.
lnps$origin <- ""
lnps$origin[lnps$RGROUP == 1 | lnps$RGROUP == 5] <- "Mexican" # One guy, listed as Mexican + Cuban, is "Hispano"
lnps$origin[lnps$RGROUP == 2] <- "Puerto Rican"
lnps$origin[lnps$RGROUP == 3] <- "Cuban"

# Keep only the harmonized columns, in the order shared by every survey chunk.
lnps <- subset(lnps, select=c(Year, Survey, id, Weight, Weight_AllLatinos, state, region, sex, age, Mexican, USborn, USborn_parents, USborn_gparents, Party, Party7, date, birthyear, education, origin, citizen))

# LNPS is the first survey, so this (re)starts the stacked dataset and
# replaces whatever df_collected held before.
df_collected <- lnps

```

## Pew

```{r Kaiser Pew 1999}

# Work on a copy of the raw Kaiser 1999 import.
data99 <- kaiser_99

names(data99)

data99 <- subset(data99, q001 == 1) # Latinos only

## Citizenship flag (rows are flagged, not dropped)

# Scratch tabulation of the raw q047 codes.
diag_df <- data99 %>%
  group_by(q047) %>%
  reframe(count=n())

# citizen = 1 when q047 is 1 or was left blank, 0 otherwise.
# A blank can arrive either as "" (string column) or as NA (numeric column
# read by read_dta()); `%in%` never matches NA, so NA is tested explicitly.
# This follows the NA-counts-as-citizen rule used by the Pew 2004-2009
# chunks in this file.
data99 <- data99 %>%
  mutate(citizen = if_else(q047 %in% c("", 1) | is.na(q047),
                 1,
                 0))

# Codebook notes for the items used below (kept as commented-out checks).
#table(data99$q006) # US-Born = 1, non-US Born = 2
#table(data99$q003_18) # 1 = Mexican, 0 = Non-Mexican, 9 = not responding
#table(data99$q102) # age in years, 99=Refused
#table(data99$q103) # Born in other country: 1 = one parent, 2 = both parents, 3 = no, 4 born in PR, 8 = Don't know
#table(data99$q104) # Born in other country: 0 = no grandparents, 1 = one g'parent, 2 = two g'parents, 3 three gparents, 4 = all g'parents, 8 = Don't know
#table(data99$q085) # 3 point PID, 1=R, 2=D, 3=I, 7=Something Else, 8=DK, 9=Refused
#table(data99$q086) # Strong democrat = 1, not strong = 2
#table(data99$q087) # Strong Republican = 1, not strong = 2
#table(data99$q088) # Lean r = 1, lean d = 2, neither = 3, other = 7, DK = 8
#table(data99$partlean) # partisanship with leaners, 1 = R, 2 = D, 3 = I , 7 = Other, 8 = DK
#data99_reduced <- subset(as.data.frame(data99), select=c(q006, q003_18, q102, q103, q104, q085, q086, q087, q088, partlean, gender, weight, id, ident))

# sex stays "" when gender is neither 1 nor 2.
data99$sex <- ""
data99$sex[data99$gender == 1] <- "M"
data99$sex[data99$gender == 2] <- "F"
# NOTE(review): samp10 is copied as-is into state -- confirm it holds state identifiers.
data99$state <- data99$samp10
# USborn defaults to 0, so a missing q006 is also coded as not US-born.
data99$USborn <- 0
data99$USborn[data99$q006 == 1] <- 1 # does not include born in PR
# USborn_parents = number of US-born parents (0/1/2), derived from q103,
# which counts parents born in another country.
data99$USborn_parents <- NA
data99$USborn_parents[data99$USborn == 0 | (data99$q103 %in% c(2,4))] <- 0 # does not include born in PR
data99$USborn_parents[data99$q103 %in% c(1)] <- 1
data99$USborn_parents[data99$q103 %in% c(3)] <- 2
# USborn_gparents = 4 minus the number of foreign-born grandparents (q104);
# forced to 0 for foreign-born respondents or when both parents are foreign-born.
# The q104 lines run last, so they overwrite that forced 0 whenever q104 is 0-3.
data99$USborn_gparents <- NA
data99$USborn_gparents[data99$USborn == 0 | (data99$q103 %in% c(2,4)) | (data99$q104 %in% c(4))] <- 0 # does not include born in PR
data99$USborn_gparents[data99$q104 %in% c(3)] <- 1
data99$USborn_gparents[data99$q104 %in% c(2)] <- 2
data99$USborn_gparents[data99$q104 %in% c(1)] <- 3
data99$USborn_gparents[data99$q104 %in% c(0)] <- 4

# Uses the file's existing `age` column; 99 is treated as a missing code.
data99$age[data99$age == 99] <- NA

data99$Mexican <- 0
data99$Mexican[data99$q003_18 == 1] <- 1

# Scratch cross-tab of the raw party items (computed on the unfiltered import).
diag_df <- kaiser_99 %>%
  group_by(q085, q086, q087, q088) %>%
  reframe(count=n())

# Party: 3-point identification with leaners (q088) folded into the parties.
# The Republican line runs second and wins if both conditions match;
# "Independent" and "Other" only fill rows that are still NA.
data99$Party <- NA
data99$Party[data99$q085 == 2 | data99$q088 == 2] <- "Democrat"
data99$Party[data99$q085 == 1 | data99$q088 == 1] <- "Republican"
data99$Party[(data99$q085 == 3 & is.na(data99$Party))] <- "Independent"
data99$Party[(data99$q085 %in% c(7,8,9) & is.na(data99$Party)) | (data99$q088 %in% c(7,8,9) & is.na(data99$Party))] <- "Other"
# Party7: starts from Party, then overwritten in order with strength
# (q086/q087) and lean (q088) labels; the q088 lines run last.
data99$Party7 <- data99$Party
data99$Party7[data99$q086 == 1] <- "Strong Democrat"
data99$Party7[data99$q086 == 2] <- "Weak Democrat"
data99$Party7[data99$q087 == 1] <- "Strong Republican"
data99$Party7[data99$q087 == 2] <- "Weak Republican"
data99$Party7[data99$q088 == 1] <- "Lean Republican"
data99$Party7[data99$q088 == 2] <- "Lean Democrat"
data99$Party7[data99$q088 == 3] <- "Independent"
data99$Party7[data99$q088 %in% c(7, 8, 9)] <- "Other"


# Cross-check of the recodes against the raw items.
diag_df <- data99 %>%
  group_by(q085, q086, q087, q088, Party, Party7) %>%
  reframe(count=n())


# Survey-level metadata and harmonized identifiers.
data99$Year <- 1999
data99$Survey <- "Kaiser"
# id combines the file's `id` and `ident` columns into one string key.
data99$id <- paste(data99$id, data99$ident, sep="-")
# Rescale so the weights sum to the number of retained rows (mean weight 1).
data99$Weight <- (nrow(data99)/sum(data99$weight))*data99$weight # convert pop weight to prob weight

# Dates are stored as a number built from month, day and two-digit year.
data99$date <- 80199 # June 30 - August 30, 1999. This is the midpoint.

# Education in four harmonized categories; unlisted q099 codes stay NA.
data99$education <- NA
data99$education[data99$q099 %in% c(1,2)] <- "NoHS"
data99$education[data99$q099 %in% c(3)] <- "HSOnly"
data99$education[data99$q099 %in% c(6,7)] <- "CollGrad"
data99$education[data99$q099 %in% c(4,5)] <- "SomeColl_Other"

## Checking origin

diag_df <- data99 %>% 
  group_by(q003_12,
           q003_09,
           q003_10,
           q003_18,
           q003_24) %>%
  reframe(count=n())

# origin defaults to "Other". The indicators are applied in order, so when
# a respondent has more than one set to 1, the last matching line wins.
data99$origin <- "Other"
data99$origin[data99$q003_18 == 1] <- "Mexican"
data99$origin[data99$q003_24 == 1] <- "Puerto Rican"
data99$origin[data99$q003_09 == 1] <- "Cuban"
data99$origin[data99$q003_10 == 1] <- "Dominican"
data99$origin[data99$q003_12 == 1] <- "Salvadoran"


diag_df <- data99 %>%
  group_by(origin) %>%
  reframe(count=n())


## Add empty variables
# Columns this survey cannot supply, added as NA so the column set matches
# the stacked dataset.

data99$Weight_AllLatinos <- NA
data99$region <- NA
data99$birthyear <- NA


# Keep only the harmonized columns, in the shared order.
data99 <- subset(as.data.frame(data99), select=c(Year, Survey, id, Weight, Weight_AllLatinos, state, region, sex, age, Mexican, USborn, USborn_parents, USborn_gparents, Party, Party7, date, birthyear, education, origin, citizen))


# Visual check that both column sets agree before stacking.
names(df_collected)
names(data99)


# Append this survey to the stacked dataset.
df_collected <- rbind(df_collected,
                      data99)

```


```{r Pew 2002}

# Work on a copy of the raw Pew 2002 import.
data02 <- pew_02

names(data02)

# Scratch cross-tab of the citizenship items.
diag_df <- data02 %>%
  group_by(QN68, CITIZEN2) %>%
  reframe(count=n())

data02 <- subset(data02, QN1 == 1) # Latinos only

# citizen = 1 when CITIZEN2 is 1, 0 for other codes, and NA when CITIZEN2 is NA.
data02 <- data02 %>%
  mutate(citizen = ifelse(CITIZEN2 == 1, 1, 0))

# sex stays "" when QN114 is neither 1 nor 2.
data02$sex <- ""
data02$sex[data02$QN114 == 1] <- "M"
data02$sex[data02$QN114 == 2] <- "F"
data02$state <- data02$STATE

# USborn defaults to 0, so a missing QN3 is also coded as not US-born.
data02$USborn <- 0
data02$USborn[data02$QN3 == 1] <- 1 # does not include born in PR
# NOTE(review): QN106 == 3 is first set to 0 and then overwritten with 2 by
# the last line below, so code 3 always ends up as 2 while codes 4 and 5
# stay 0. The QN106 code list is not shown in this file -- confirm against
# the codebook which code means "neither parent born abroad".
data02$USborn_parents <- NA
data02$USborn_parents[data02$USborn == 0 | (data02$QN106 %in% c(3,4,5))] <- 0 # does not include born in PR
data02$USborn_parents[data02$QN106 %in% c(1,2)] <- 1
data02$USborn_parents[data02$QN106 %in% c(3)] <- 2
# USborn_gparents = 4 minus QN109 (presumably the number of foreign-born
# grandparents -- confirm). The forced 0 from the first line is overwritten
# by the QN109 lines whenever QN109 is 0-3.
data02$USborn_gparents <- NA
data02$USborn_gparents[data02$USborn == 0 | (data02$QN106 %in% c(3,4,5)) | (data02$QN109 %in% c(4))] <- 0 # does not include born in PR
data02$USborn_gparents[data02$QN109 %in% c(3)] <- 1
data02$USborn_gparents[data02$QN109 %in% c(2)] <- 2
data02$USborn_gparents[data02$QN109 %in% c(1)] <- 3
data02$USborn_gparents[data02$QN109 %in% c(0)] <- 4

# 99 is treated as a missing code for age.
data02$age <- data02$QN105
data02$age[data02$QN105 == 99] <- NA

# Mexican if either origin item carries code 18.
data02$Mexican <- 0
data02$Mexican[data02$QN5_18 == 18 | data02$QN4 == 18] <- 1

# Scratch cross-tab of the raw party items (computed on the unfiltered import).
diag_df <- pew_02 %>%
  group_by(QN90, QN91) %>%
  reframe(count=n())

# Party: 3-point identification with leaners (QN91) folded into the parties.
# The Republican line runs second and wins if both conditions match;
# "Independent" and "Other" only fill rows that are still NA.
data02$Party <- NA
data02$Party[data02$QN90 == 2 | data02$QN91 == 2] <- "Democrat"
data02$Party[data02$QN90 == 1 | data02$QN91 == 1] <- "Republican"
data02$Party[(data02$QN90 == 3 & is.na(data02$Party))] <- "Independent"
data02$Party[(data02$QN90 %in% c(7,8,9) & is.na(data02$Party)) | (data02$QN91 %in% c(7,8,9) & is.na(data02$Party))] <- "Other"
# Party7: no strength item is used here, so only the lean question refines
# the 3-point label; respondents without a QN91 answer keep their Party value.
data02$Party7 <- data02$Party
data02$Party7[data02$QN91 == 1] <- "Lean Republican"
data02$Party7[data02$QN91 == 2] <- "Lean Democrat"
data02$Party7[data02$QN91 == 0] <- "Independent"
data02$Party7[data02$QN91 %in% c(7,8,9)] <- "Other"

# Cross-check of the recodes against the raw items.
diag_df <- data02 %>%
  group_by(Party, Party7, QN90, QN91) %>%
  reframe(count=n())


# Survey-level metadata and harmonized identifiers.
data02$Year <- 2002
data02$Survey <- "Kaiser-Pew"
data02$id <- data02$ID
data02$Weight <- data02$WEIGHTH

# Dates are stored as a number built from month, day and two-digit year.
data02$date <- 50802 # April 4 - June 11, 2002. This is the midpoint.

# Education in four harmonized categories; unlisted QN101 codes stay NA.
data02$education <- NA
data02$education[data02$QN101 %in% c(1,2)] <- "NoHS"
data02$education[data02$QN101 %in% c(3,4)] <- "HSOnly"
data02$education[data02$QN101 %in% c(7,8)] <- "CollGrad"
data02$education[data02$QN101 %in% c(5,6)] <- "SomeColl_Other" # includes non-4 year degree holders

## Checking national origin
# diag_df <- data02 %>%
#   group_by(QN5_18,
#            QN5_24,
#            QN5_09
#            ) %>%
#   reframe(count=n())


# Scratch cross-tab of two of the origin indicator columns.
diag_df <- data02 %>%
  group_by(QN5_18,
           QN5_24) %>%
  reframe(count=n())

# origin defaults to "Other". Each group is identified by its own QN5_<code>
# indicator column holding that code, or by QN4 holding the same code.
# Lines are applied in order, so the last matching line wins for
# respondents who match several.
data02$origin <- "Other"
data02$origin[data02$QN5_18 == 18 | data02$QN4 == 18] <- "Mexican"
# QN3 == 2 also counts as Puerto Rican.
data02$origin[data02$QN5_24 == 24 | data02$QN4 == 24 | data02$QN3 == 2] <- "Puerto Rican"
data02$origin[data02$QN5_09 == 9 | data02$QN4 == 9] <- "Cuban"
data02$origin[data02$QN5_10 == 10 | data02$QN4 == 10] <- "Dominican"
# Code 12 lives in QN5_12; the earlier test against QN5_09 (the Cuban
# indicator) could never equal 12, so Salvadorans identified only through
# the QN5 indicators were left as "Other".
data02$origin[data02$QN5_12 == 12 | data02$QN4 == 12] <- "Salvadoran"

# Tabulate the resulting categories.
diag_df <- data02 %>%
  group_by(origin) %>%
  reframe(count=n())

## Add empty variables
# Columns this survey cannot supply, added as NA so the column set matches
# the stacked dataset.

names(df_collected)

data02$Weight_AllLatinos <- NA
data02$region <- NA
data02$birthyear <- NA

# Keep only the harmonized columns, in the shared order.
data02 <- subset(as.data.frame(data02), select=c(Year, Survey, id, Weight, Weight_AllLatinos, state, region, sex, age, Mexican, USborn, USborn_parents, USborn_gparents, Party, Party7, date, birthyear, education, origin, citizen))

# Append this survey to the stacked dataset.
df_collected <- rbind(df_collected,
                      data02)

```



```{r Pew 2004 }


# Work on a copy of the raw Pew 2004 import.
data04 <- pew_04

names(data04)

## Citizen

# Scratch tabulation of the raw QN17 codes.
diag_df <- data04 %>%
  group_by(QN17) %>%
  reframe(count=n())

# citizen = 1 when QN17 is 1 or missing, 0 otherwise.
data04 <- data04 %>%
  mutate(citizen = ifelse(QN17 == 1 | is.na(QN17) == TRUE,
                1,
                0))

# Cross-check of the recode against the raw item.
diag_df <- data04 %>%
  group_by(QN17, citizen) %>%
  reframe(count=n())

#QND18 = gender, 1=male
#QN5, 1=Mexican
#QN3, 1= Born in US, 2=PR, 3=Other country, 8=DK, 9=Ref
#QN64 = 3 point PID, 1=R, 2=D, 3=I, 7=Something else, 8=DK, 9=Refused
#QN65 = Neither = 0, Lean r = 1, lean d = 2 , other = 7, DK = 8
#QN75 = Age, 99=Refused
#QN76 = Parents born in PR, 1=mother, 2=father, 3=both, 4-no
#QN77 = Parents born outside US, 1=mother, 2=father, 3=both, 4=no
# STATE = State Abbreviation
# WEIGHT = weight

# sex stays "" when QND18 is neither 1 nor 2.
data04$sex <- ""
data04$sex[data04$QND18 == 1] <- "M"
data04$sex[data04$QND18 == 2] <- "F"
data04$state <- data04$STATE
# USborn stays NA for DK/Refused (QN3 codes 8/9) and for missing QN3.
data04$USborn <- NA
data04$USborn[data04$QN3 %in% c(2,3)] <- 0 
data04$USborn[data04$QN3 == 1] <- 1 # does not include born in PR
# USborn_parents = number of US-born parents (0/1/2). Lines are applied in
# order and later ones overwrite earlier ones.
# NOTE(review): the last line sets 2 when EITHER QN76 or QN77 is 4, so a
# respondent with QN77 == 3 but QN76 == 4 ends at 2 -- confirm that the
# survey's skip pattern prevents both items being answered.
data04$USborn_parents <- NA
data04$USborn_parents[data04$USborn == 0 | (data04$QN76 %in% c(3)) | (data04$QN77 %in% c(3))] <- 0 # does not include born in PR
data04$USborn_parents[(data04$QN76 %in% c(1,2)) | (data04$QN77 %in% c(1,2))] <- 1
data04$USborn_parents[(data04$QN76 %in% c(4)) | (data04$QN77 %in% c(4))] <- 2
# No grandparent item is used here: only the implied 0 is filled in, every
# other row stays NA.
data04$USborn_gparents <- NA
data04$USborn_gparents[data04$USborn == 0 | data04$USborn_parents == 0] <- 0

# 99 is the refusal code for age.
data04$age <- data04$QN75
data04$age[data04$age == 99] <- NA

data04$Mexican <- 0
data04$Mexican[data04$QN5 == 1] <- 1


# Scratch cross-tab of the raw party items (computed on the unfiltered import).
diag_df <- pew_04 %>%
  group_by(QN64, QN65) %>%
  reframe(count=n())

# Party: 3-point identification with leaners (QN65) folded into the parties.
# The Republican line runs second and wins if both conditions match.
# Only code 7 maps to "Other" here; QN64 codes 8/9 leave Party as NA unless
# the lean item supplies a value.
data04$Party <- NA
data04$Party[data04$QN64 == 2 | data04$QN65 == 2] <- "Democrat"
data04$Party[data04$QN64 == 1 | data04$QN65 == 1] <- "Republican"
data04$Party[(data04$QN64 == 3 & is.na(data04$Party))] <- "Independent"
data04$Party[(data04$QN64 == 7 & is.na(data04$Party)) | (data04$QN65 == 7 & is.na(data04$Party))] <- "Other"
# Party7: no strength item is used, so only the lean question refines the
# 3-point label.
data04$Party7 <- data04$Party
data04$Party7[data04$QN65 == 1] <- "Lean Republican"
data04$Party7[data04$QN65 == 2] <- "Lean Democrat"
data04$Party7[data04$QN65 == 0] <- "Independent"
data04$Party7[data04$QN65 %in% c(7,8,9)] <- "Other"

# Cross-check of the recodes against the raw items.
diag_df <- data04 %>%
  group_by(Party, Party7, QN64, QN65) %>%
  reframe(count=n())


# Survey-level metadata and harmonized identifiers.
data04$Year <- 2004
data04$Survey <- "Pew"
data04$id <- data04$ID
data04$Weight <- data04$WEIGHT

# Dates are stored as a number built from month, day and two-digit year.
data04$date <- 51304 # April 21 - June 9, 2004. This is the midpoint.

# Education in four harmonized categories; unlisted QN74 codes stay NA.
data04$education <- NA
data04$education[data04$QN74 %in% c(1,2)] <- "NoHS"
data04$education[data04$QN74 %in% c(3,4)] <- "HSOnly"
data04$education[data04$QN74 %in% c(7,8)] <- "CollGrad"
data04$education[data04$QN74 %in% c(5,6)] <- "SomeColl_Other" # includes non-4 year degree holders

# Scratch tabulation of the raw origin codes.
diag_df <- data04 %>%
  group_by(QN5) %>%
  reframe(count=n())


# origin defaults to "Other"; unlisted QN5 codes keep that default.
# "Puerto Rican" is also assigned from birthplace (QN3 == 2) or both parents
# born in PR (QN76 == 3), but later QN5-based lines can overwrite it.
data04$origin <- "Other"
data04$origin[data04$QN5 == 1] <- "Mexican"
data04$origin[data04$QN5 == 2 | data04$QN3 == 2 | data04$QN76 %in% c(3)] <- "Puerto Rican"
data04$origin[data04$QN5 == 3] <- "Cuban"
data04$origin[data04$QN5 == 4] <- "Dominican"
data04$origin[data04$QN5 == 5] <- "Salvadoran"
data04$origin[data04$QN5 == 6] <- "Other Central American"
data04$origin[data04$QN5 == 7] <- "Other South American"
data04$origin[data04$QN5 == 34] <- "Spain"
data04$origin[data04$QN5 == 35] <- "Portugal"

# Cross-check of the recode against the raw item.
diag_df <- data04 %>%
  group_by(QN5, origin) %>%
  reframe(count=n())


## Add empty variables
# Columns this survey cannot supply, added as NA so the column set matches
# the stacked dataset.

names(df_collected)

data04$Weight_AllLatinos <- NA
data04$region <- NA
data04$birthyear <- NA

# Keep only the harmonized columns, in the shared order.
data04 <- subset(as.data.frame(data04), select=c(Year, Survey, id, Weight, Weight_AllLatinos, state, region, sex, age, Mexican, USborn, USborn_parents, USborn_gparents, Party, Party7, date, birthyear, education, origin, citizen))

# Append this survey to the stacked dataset.
df_collected <- rbind(df_collected,
                      data04)

```



```{r Pew 2006}

# Work on a copy of the raw Pew 2006 import.
data06 <- pew_06


## Citizen

# Scratch tabulation of the raw qn5 codes.
diag_df <- data06 %>%
  group_by(qn5) %>%
  reframe(count=n())

# citizen = 1 when qn5 is 1 or missing, 0 otherwise.
data06 <- data06 %>%
  mutate(citizen = ifelse(qn5 == 1 | is.na(qn5)==TRUE,
                          1,
                          0))

# Cross-check of the recode against the raw item.
diag_df <- data06 %>%
  group_by(qn5, citizen) %>%
  reframe(count=n())

#qns5 = gender, 1=male
#qns10, 1=Mexican
#qns8, 1= Born in US, 2=PR, 3=Other country, 9=Ref
#qn45 = 3 point PID, 1=R, 2=D, 3=I, 7=Something else, 8=DK, 9=Refused
#qns46 = other = 0, Lean r = 1, lean d = 2 , neither = 3, DK = 8 (note: only asked if qn45=3)
#qn63 = Age, 99=Refused
#qn64 = Parents born outside US, 1=mother, 2=father, 3=both, 4=no, 5=Puerto Rico
# region = census region, 1=northeast, 2=north central, 3=south, 4=west
# newwght = weight
#qns9 = Country R was born in

# sex stays "" when qns5 is neither 1 nor 2.
data06$sex <- ""
data06$sex[data06$qns5 == 1] <- "M"
data06$sex[data06$qns5 == 2] <- "F"
# No state identifier is taken from this file (only census region is listed above).
data06$state <- NA
# USborn stays NA for refusals (qns8 code 9) and for missing qns8.
data06$USborn <- NA
data06$USborn[data06$qns8 %in% c(2,3)] <- 0 
data06$USborn[data06$qns8 == 1] <- 1 # does not include born in PR
# USborn_parents = number of US-born parents (0/1/2), derived from qn64.
# The qn64-based lines run after the USborn == 0 default and overwrite it.
data06$USborn_parents <- NA
data06$USborn_parents[data06$USborn == 0 | (data06$qn64 %in% c(3, 5))] <- 0 # does not include born in PR
data06$USborn_parents[(data06$qn64 %in% c(1,2))] <- 1
data06$USborn_parents[(data06$qn64 %in% c(4))] <- 2
# No grandparent item is used here: only the implied 0 is filled in, every
# other row stays NA.
data06$USborn_gparents <- NA
data06$USborn_gparents[data06$USborn == 0 | data06$USborn_parents == 0] <- 0

# 99 is the refusal code for age.
data06$age <- data06$qn63
data06$age[data06$age == 99] <- NA

# Mexican by country of birth (qns9 == 18) or by the origin item (qns10 == 1).
data06$Mexican <- 0
data06$Mexican[data06$qns9 == 18 | data06$qns10 == 1] <- 1

# Scratch cross-tab of the raw party items (computed on the unfiltered import).
diag_df <-pew_06 %>%
  group_by(qn45, qns46) %>%
  reframe(count=n())

# Party: 3-point identification with leaners (qns46) folded into the parties.
# The Republican line runs second and wins if both conditions match;
# "Independent" and "Other" only fill rows that are still NA.
data06$Party <- NA
data06$Party[data06$qn45 == 2 | data06$qns46 == 2] <- "Democrat"
data06$Party[data06$qn45 == 1 | data06$qns46 == 1] <- "Republican"
data06$Party[(data06$qn45 == 3 & is.na(data06$Party))] <- "Independent"
data06$Party[(data06$qn45 %in% c(7,8,9) & is.na(data06$Party)) | (data06$qns46 == 0 & is.na(data06$Party))] <- "Other"
# Party7: no strength item is used, so only the lean question refines the
# 3-point label. qns46 == 0 is not recoded here and keeps its Party value.
data06$Party7 <- data06$Party
data06$Party7[data06$qns46 == 1] <- "Lean Republican"
data06$Party7[data06$qns46 == 2] <- "Lean Democrat"
data06$Party7[data06$qns46 == 3] <- "Independent"
data06$Party7[data06$qns46 %in% c(7,8,9)] <- "Other"

# Cross-check of the recodes against the raw items.
diag_df <-data06 %>%
  group_by(Party, Party7, qn45, qns46) %>%
  reframe(count=n())

# Survey-level metadata and harmonized identifiers.
data06$Year <- 2006
data06$Survey <- "Pew"
data06$id <- data06$caseid
data06$Weight <- data06$newwght
# Self-assignment (no-op): the file's own `region` column is kept as is.
data06$region <- data06$region

# Dates are stored as a number built from month, day and two-digit year.
data06$date <- 61706 # June 5 - July 3, 2006. This is the midpoint.

# Education in four harmonized categories; unlisted qn59 codes stay NA.
data06$education <- NA
data06$education[data06$qn59 %in% c(1,2)] <- "NoHS"
data06$education[data06$qn59 %in% c(3,4)] <- "HSOnly"
data06$education[data06$qn59 %in% c(7,8)] <- "CollGrad"
data06$education[data06$qn59 %in% c(5,6)] <- "SomeColl_Other" # includes non-4 year degree holders

# origin defaults to "Other"; only three groups are coded for this survey.
# Lines are applied in order, so a later match overwrites an earlier one.
data06$origin <- "Other"
data06$origin[data06$qns9 == 18 | data06$qns10 == 1] <- "Mexican"
data06$origin[data06$qns9 == 24 | data06$qns10 == 2 | data06$qns8 == 2] <- "Puerto Rican"
data06$origin[data06$qns9 == 9 | data06$qns10 == 3] <- "Cuban"


## Add empty variables
# Columns this survey cannot supply, added as NA so the column set matches
# the stacked dataset.

names(df_collected)

data06$Weight_AllLatinos <- NA
data06$birthyear <- NA

# Keep only the harmonized columns, in the shared order.
data06 <- subset(as.data.frame(data06), select=c(Year, Survey, id, Weight, Weight_AllLatinos, state, region, sex, age, Mexican, USborn, USborn_parents, USborn_gparents, Party, Party7, date, birthyear, education, origin, citizen))

# Append this survey to the stacked dataset.
df_collected <- rbind(df_collected,
                      data06)


```




```{r Pew 2007}

# Work on a copy of the raw Pew 2007 import.
data07 <- pew_07 

## Citizenship

# Scratch tabulation of the raw qn9 codes.
diag_df <- pew_07 %>%
  group_by(qn9) %>%
  reframe(count=n())

# citizen = 1 when qn9 is 1 or missing, 0 otherwise.
data07 <- data07 %>%
  mutate(citizen = ifelse(is.na(qn9)== TRUE |
         qn9 == 1, 1, 0))

# Cross-check of the recode against the raw item.
diag_df <- data07 %>%
  group_by(qn9, citizen) %>%
  reframe(count=n())

#qnd18 = gender, 1=male
#qn4, 1=Mexican
#qn5, 1= PR, 2=US, 3=Other country, 98=DK, 99=Ref
#qn17 = 3 point PID, 1=R, 2=D, 3=I, 7=Something else, 8=DK, 9=Refused
#qn18 = Lean r = 1, lean d = 2 , neither = 3, other = 4, DK = 8, ref=9
#qn50 = Age, 98=DK, 99=Refused
#qn7 = Mother born outside US, 1=PR, 2=US, 3=other country, 98=DK, 99=Ref
#qn8 = Father born outside US, 1=PR, 2=US, 3=other country, 98=DK, 99=Ref
# sample03 = census region, 1=northeast, 2=north central, 3=south, 4=west
# weight = weight
#qn5a = Country R was born in

# sex stays "" when qnd18 is neither 1 nor 2.
data07$sex <- ""
data07$sex[data07$qnd18 == 1] <- "M"
data07$sex[data07$qnd18 == 2] <- "F"
# No state identifier is taken from this file (only census region is listed above).
data07$state <- NA
# USborn stays NA for DK/Refused (qn5 codes 98/99) and for missing qn5.
data07$USborn <- NA
data07$USborn[data07$qn5 %in% c(1,3)] <- 0 
data07$USborn[data07$qn5 == 2] <- 1 # does not include born in PR
# USborn_parents = number of US-born parents (0/1/2), from the separate
# mother (qn7) and father (qn8) items. Lines are applied in order: "either
# parent" sets 1, then "both parents" upgrades to 2.
data07$USborn_parents <- NA
data07$USborn_parents[data07$USborn == 0 | (data07$qn7 %in% c(1, 3) & data07$qn8 %in% c(1, 3))] <- 0 # does not include born in PR
data07$USborn_parents[(data07$qn7 == 2) | (data07$qn8 == 2)] <- 1
data07$USborn_parents[(data07$qn7 == 2) & (data07$qn8 == 2)] <- 2
# No grandparent item is used here: only the implied 0 is filled in, every
# other row stays NA.
data07$USborn_gparents <- NA
data07$USborn_gparents[data07$USborn == 0 | data07$USborn_parents == 0] <- 0

# Age in years from qn50. Both non-response codes documented for this item
# (98 = DK, 99 = Refused) are set to NA; previously only 99 was removed, so
# "don't know" answers were kept as an age of 98.
data07$age <- data07$qn50
data07$age[data07$age %in% c(98, 99)] <- NA

data07$Mexican <- 0
data07$Mexican[data07$qn4 == 1] <- 1

# Scratch cross-tab of the raw party items.
diag_df <- data07 %>%
  group_by(qn17, qn18) %>%
  reframe(count=n())

# Party: 3-point identification with leaners (qn18) folded into the parties.
# The Republican line runs second and wins if both conditions match.
# Only qn17 == 7 or qn18 == 4 map to "Other"; DK/Refused leave Party as NA.
data07$Party <- NA
data07$Party[data07$qn17 == 2 | data07$qn18 == 2] <- "Democrat"
data07$Party[data07$qn17 == 1 | data07$qn18 == 1] <- "Republican"
data07$Party[(data07$qn17 == 3 & is.na(data07$Party))] <- "Independent"
data07$Party[(data07$qn17 == 7 & is.na(data07$Party)) | (data07$qn18 == 4 & is.na(data07$Party))] <- "Other"
# Party7: no strength item is used, so only the lean question refines the
# 3-point label.
data07$Party7 <- data07$Party
data07$Party7[data07$qn18 == 1] <- "Lean Republican"
data07$Party7[data07$qn18 == 2] <- "Lean Democrat"
data07$Party7[data07$qn18 == 3] <- "Independent"
data07$Party7[data07$qn18 %in% c(8,9,4)] <- "Other"

# Cross-check of the recodes against the raw items.
diag_df <- data07 %>%
  group_by(Party, Party7, qn17, qn18) %>%
  reframe(count=n())


# Survey-level metadata and harmonized identifiers.
data07$Year <- 2007
data07$Survey <- "Pew"
# Self-assignment (no-op): the file's own `id` column is kept as is.
data07$id <- data07$id
data07$Weight <- data07$weight
data07$region <- data07$sample03

# Dates are stored as a number built from month, day and two-digit year.
data07$date <- 101807 # Oct 3 - Nov 9, 2007. This is the midpoint.

# Education in four harmonized categories; unlisted qn51 codes stay NA.
data07$education <- NA
data07$education[data07$qn51 %in% c(1,2)] <- "NoHS"
data07$education[data07$qn51 %in% c(3,4)] <- "HSOnly"
data07$education[data07$qn51 %in% c(7,8)] <- "CollGrad"
data07$education[data07$qn51 %in% c(5,6)] <- "SomeColl_Other" # includes non-4 year degree holders

# origin defaults to "Other"; unlisted qn4 codes keep that default.
# "Puerto Rican" is also assigned from birthplace (qn5 == 1), but later
# qn4-based lines can overwrite it.
data07$origin <- "Other"
data07$origin[data07$qn4 == 1] <- "Mexican"
data07$origin[data07$qn4 == 2 | data07$qn5 == 1] <- "Puerto Rican"
data07$origin[data07$qn4 == 3] <- "Cuban"
data07$origin[data07$qn4 == 4] <- "Dominican"
data07$origin[data07$qn4 == 5] <- "Salvadoran"
data07$origin[data07$qn4 == 6] <- "Other Central American"
data07$origin[data07$qn4 == 7] <- "Other South American"
data07$origin[data07$qn4 == 8] <- "Other Caribbean"
data07$origin[data07$qn4 == 10] <- "Other European"


## Add empty variables
# Columns this survey cannot supply, added as NA so the column set matches
# the stacked dataset.

names(df_collected)

data07$Weight_AllLatinos <- NA
data07$birthyear <- NA

# Keep only the harmonized columns, in the shared order.
data07 <- subset(as.data.frame(data07), select=c(Year, Survey, id, Weight, Weight_AllLatinos, state, region, sex, age, Mexican, USborn, USborn_parents, USborn_gparents, Party, Party7, date, birthyear, education, origin, citizen))

# Append this survey to the stacked dataset.
df_collected <- rbind(df_collected,
                      data07)



```


```{r Pew 2008}

# Work on a copy of the raw Pew 2008 import.
data08 <- pew_08

names(pew_08)

## Citizen
# Scratch tabulation of the raw qn9 codes.
diag_df <- data08 %>%
  group_by(qn9) %>%
  reframe(count=n())

# citizen = 1 when qn9 is 1 or missing, 0 otherwise.
data08 <- data08 %>%
  mutate(citizen = ifelse(is.na(qn9) == TRUE | qn9 == 1,
                          1,
                          0))


#qnd18 = gender, 1=male
#qn4, 1=Mexican
#qn5, 1= PR, 2=US, 3=Other country, 98=DK, 99=Ref
#qn21 = 3 point PID, 1=R, 2=D, 3=I, 7=Something else, 8=DK, 9=Refused
#qn22 = Lean r = 1, lean d = 2 , neither = 3, other = 4, DK = 8, ref=9
#qn62 = Age, 98=DK, 99=Refused
#qn7 = Mother born outside US, 1=PR, 2=US, 3=other country, 98=DK, 99=Ref
#qn8 = Father born outside US, 1=PR, 2=US, 3=other country, 98=DK, 99=Ref
# samp13 = census region, 1=northeast, 2=north central, 3=south, 4=west
# weight = weight
#qn5a = Country R was born in

# sex stays "" when qnd18 is neither 1 nor 2.
data08$sex <- ""
data08$sex[data08$qnd18 == 1] <- "M"
data08$sex[data08$qnd18 == 2] <- "F"
# No state identifier is taken from this file (only census region is listed above).
data08$state <- NA
# USborn stays NA for DK/Refused (qn5 codes 98/99) and for missing qn5.
data08$USborn <- NA
data08$USborn[data08$qn5 %in% c(1,3)] <- 0 
data08$USborn[data08$qn5 == 2] <- 1 # does not include born in PR
# USborn_parents = number of US-born parents (0/1/2), from the separate
# mother (qn7) and father (qn8) items. Lines are applied in order: "either
# parent" sets 1, then "both parents" upgrades to 2.
data08$USborn_parents <- NA
data08$USborn_parents[data08$USborn == 0 | (data08$qn7 %in% c(1, 3) & data08$qn8 %in% c(1, 3))] <- 0 # does not include born in PR
data08$USborn_parents[(data08$qn7 == 2) | (data08$qn8 == 2)] <- 1
data08$USborn_parents[(data08$qn7 == 2) & (data08$qn8 == 2)] <- 2
# No grandparent item is used here: only the implied 0 is filled in, every
# other row stays NA.
data08$USborn_gparents <- NA
data08$USborn_gparents[data08$USborn == 0 | data08$USborn_parents == 0] <- 0

# Age in years from qn62. Both non-response codes documented for this item
# (98 = DK, 99 = Refused) are set to NA; previously only 99 was removed, so
# "don't know" answers were kept as an age of 98.
data08$age <- data08$qn62
data08$age[data08$age %in% c(98, 99)] <- NA

data08$Mexican <- 0
data08$Mexican[data08$qn4 == 1] <- 1

# Party: 3-point identification with leaners (qn22) folded into the parties.
# The Republican line runs second and wins if both conditions match.
# Only qn21 == 7 or qn22 == 4 map to "Other"; DK/Refused leave Party as NA.
data08$Party <- NA
data08$Party[data08$qn21 == 2 | data08$qn22 == 2] <- "Democrat"
data08$Party[data08$qn21 == 1 | data08$qn22 == 1] <- "Republican"
data08$Party[(data08$qn21 == 3 & is.na(data08$Party))] <- "Independent"
data08$Party[(data08$qn21 == 7 & is.na(data08$Party)) | (data08$qn22 == 4 & is.na(data08$Party))] <- "Other"
# Party7: no strength item is used, so only the lean question refines the
# 3-point label.
data08$Party7 <- data08$Party
data08$Party7[data08$qn22 == 1] <- "Lean Republican"
data08$Party7[data08$qn22 == 2] <- "Lean Democrat"
data08$Party7[data08$qn22 == 3] <- "Independent"
data08$Party7[data08$qn22 %in% c(8,9,4)] <- "Other"

# Cross-check of the recodes against the raw items.
diag_df <- data08 %>%
  group_by(Party, Party7, qn21, qn22) %>%
  reframe(count=n())

# Survey-level metadata and harmonized identifiers.
data08$Year <- 2008
data08$Survey <- "Pew"
data08$id <- data08$cati_id
data08$Weight <- data08$weight
data08$region <- data08$samp13

# Dates are stored as a number built from month, day and two-digit year
# (e.g. 61706 = June 17, 2006). The previous value, 101807, and its comment
# were copied from the 2007 chunk and stamped this 2008 survey with a 2007
# field date.
# NOTE(review): the field period below is the one published for the Pew
# Hispanic Center 2008 National Survey of Latinos -- confirm that pew_08.sav
# is that survey and adjust the midpoint if it is a different 2008 study.
data08$date <- 62608 # June 9 - July 13, 2008. This is the midpoint.

# Education in four harmonized categories; unlisted qn63 codes stay NA.
data08$education <- NA
data08$education[data08$qn63 %in% c(1,2)] <- "NoHS"
data08$education[data08$qn63 %in% c(3,4)] <- "HSOnly"
data08$education[data08$qn63 %in% c(7,8)] <- "CollGrad"
data08$education[data08$qn63 %in% c(5,6)] <- "SomeColl_Other" # includes non-4 year degree holders

# origin defaults to "Other"; unlisted qn4 codes keep that default.
# "Puerto Rican" is also assigned from birthplace (qn5 == 1), but later
# qn4-based lines can overwrite it.
data08$origin <- "Other"
data08$origin[data08$qn4 == 1] <- "Mexican"
data08$origin[data08$qn4 == 2 | data08$qn5 == 1] <- "Puerto Rican"
data08$origin[data08$qn4 == 3] <- "Cuban"
data08$origin[data08$qn4 == 4] <- "Dominican"
data08$origin[data08$qn4 == 5] <- "Salvadoran"
data08$origin[data08$qn4 == 6] <- "Other Central American"
data08$origin[data08$qn4 == 7] <- "Other South American"
data08$origin[data08$qn4 == 8] <- "Other Caribbean"
data08$origin[data08$qn4 == 10] <- "Other European"

## Add empty variables
# Columns this survey cannot supply, added as NA so the column set matches
# the stacked dataset.

names(df_collected)

data08$Weight_AllLatinos <- NA
data08$birthyear <- NA

# Keep only the harmonized columns, in the shared order.
data08 <- subset(as.data.frame(data08), select=c(Year, Survey, id, Weight, Weight_AllLatinos, state, region, sex, age, Mexican, USborn, USborn_parents, USborn_gparents, Party, Party7, date, birthyear, education, origin, citizen))

# Append this survey to the stacked dataset.
df_collected <- rbind(df_collected,
                      data08)


```


```{r Pew 2009}

## Pew 2009: recode the raw file into the shared df_collected columns and
## append it to the running stack.

data09 <- pew_09

names(data09)

## Citizen
## qn9 == 1 and a missing qn9 are both coded as citizen (1); everything else
## is 0.
## NOTE(review): presumably qn9 is only asked of respondents who are not
## citizens by birth -- confirm against the questionnaire.
diag_df <- data09 %>%
  group_by(qn9) %>%
  reframe(count=n())

data09 <- data09 %>%
  mutate(citizen = ifelse(is.na(qn9) == TRUE | qn9 == 1,
                          1,
                          0))

## Cross-tab of the raw item against the derived flag, for a visual check
diag_df <- data09 %>%
  group_by(qn9, citizen) %>%
  reframe(count=n())

## Codebook notes for the variables used below
#gender = gender, 1=male
#qn4, 1=Mexican
#qn5, 1= PR, 2=US, 3=Other country, 8=DK, 9=Ref
#qn98 = 3 point PID, 1=R, 2=D, 3=I, 7=Something else, 8=DK, 9=Refused
#qn99 = Lean r = 1, lean d = 2 , neither = 3, other = 4, DK = 8, ref=9 (includes 7,8,9 in previous q)
#qn9a = Age, 98=DK, 99=Refused
#qn7 = Mother born outside US, 1=PR, 2=US, 3=other country, 8=DK, 9=Ref
#qn8 = Father born outside US, 1=PR, 2=US, 3=other country, 8=DK, 9=Ref
# sample20 = census region, 1=northeast, 2=north central, 3=south, 4=west
# weight = weight
#qn5a = Country R was born in

## Sex: "" is kept for any gender code other than 1 or 2
data09$sex <- ""
data09$sex[data09$gender == 1] <- "M"
data09$sex[data09$gender == 2] <- "F"
data09$state <- NA

## Nativity. DK/Refused (8, 9) stay NA.
data09$USborn <- NA
data09$USborn[data09$qn5 %in% c(1,3)] <- 0 
data09$USborn[data09$qn5 == 2] <- 1 # does not include born in PR

## Number of US-born parents (0, 1 or 2). Later lines overwrite earlier ones,
## so a respondent with USborn == 0 but a US-born parent ends up as 1 or 2.
data09$USborn_parents <- NA
data09$USborn_parents[data09$USborn == 0 | (data09$qn7 %in% c(1, 3) & data09$qn8 %in% c(1, 3))] <- 0 # does not include born in PR
data09$USborn_parents[(data09$qn7 == 2) | (data09$qn8 == 2)] <- 1
data09$USborn_parents[(data09$qn7 == 2) & (data09$qn8 == 2)] <- 2

## Grandparents: only the 0 case can be derived here; everything else is NA
data09$USborn_gparents <- NA
data09$USborn_gparents[data09$USborn == 0 | data09$USborn_parents == 0] <- 0

## Age. Both non-response codes listed in the codebook notes above
## (98 = DK, 99 = Refused) are set to missing; previously only 99 was, so
## "don't know" answers were kept as an age of 98.
data09$age <- data09$qn9a # has 16-17 year olds
data09$age[data09$age %in% c(98, 99)] <- NA

data09$Mexican <- 0
data09$Mexican[data09$qn4 == 1] <- 1

## Party (leaners folded into the party). The Republican line runs after the
## Democrat line, so it wins when both conditions hold. Independent / Other
## only fill rows that are still NA.
data09$Party <- NA
data09$Party[data09$qn98 == 2 | data09$qn99 == 2] <- "Democrat"
data09$Party[data09$qn98 == 1 | data09$qn99 == 1] <- "Republican"
data09$Party[(data09$qn98 == 3 & is.na(data09$Party))] <- "Independent"
data09$Party[(data09$qn98 == 7 & is.na(data09$Party)) | (data09$qn99 == 4 & is.na(data09$Party))] <- "Other"

## Party7 starts as Party and is then overwritten wherever the lean item
## (qn99) has a listed code.
data09$Party7 <- data09$Party
data09$Party7[data09$qn99 == 1] <- "Lean Republican"
data09$Party7[data09$qn99 == 2] <- "Lean Democrat"
data09$Party7[data09$qn99 == 3] <- "Independent"
data09$Party7[data09$qn99 %in% c(8,9,4)] <- "Other"


diag_df <- data09 %>%
  group_by(qn98, qn99, Party, Party7) %>%
  reframe(count=n())


## Survey metadata
data09$Year <- 2009
data09$Survey <- "Pew"
data09$id <- data09$id
data09$Weight <- data09$weight
data09$region <- data09$sample20

## Stored as a number in MMDDYY form (no leading zero)
data09$date <- 82009 # Aug 5- Sep 6, 2009. This is the midpoint.

## Education: qn13 codes outside 1-7 stay NA
data09$education <- NA
data09$education[data09$qn13 %in% c(1,2)] <- "NoHS"
data09$education[data09$qn13 %in% c(3)] <- "HSOnly"
data09$education[data09$qn13 %in% c(6,7)] <- "CollGrad"
data09$education[data09$qn13 %in% c(4,5)] <- "SomeColl_Other" # includes non-4 year degree holders

## Origin: "Other" unless qn4 has one of the listed codes. The Puerto Rican
## line also matches qn5 == 1 (born in PR) and overrides a Mexican label.
data09$origin <- "Other"
data09$origin[data09$qn4 == 1] <- "Mexican"
data09$origin[data09$qn4 == 2 | data09$qn5 == 1] <- "Puerto Rican"
data09$origin[data09$qn4 == 3] <- "Cuban"
data09$origin[data09$qn4 == 4] <- "Dominican"
data09$origin[data09$qn4 == 5] <- "Salvadoran"
data09$origin[data09$qn4 == 6] <- "Other Central American"
data09$origin[data09$qn4 == 7] <- "Other South American"
data09$origin[data09$qn4 == 8] <- "Other Caribbean"
data09$origin[data09$qn4 == 10] <- "Other European"

## Add empty variables

names(df_collected)

data09$Weight_AllLatinos <- NA
data09$birthyear <- NA

## Keep only the shared columns, in the shared order, then append
data09 <- subset(as.data.frame(data09), select=c(Year, Survey, id, Weight, Weight_AllLatinos, state, region, sex, age, Mexican, USborn, USborn_parents, USborn_gparents, Party, Party7, date, birthyear, education, origin, citizen))

df_collected <- rbind(df_collected,
                      data09)


```



```{r Pew 2010}

## Pew 2010: map the raw survey onto the shared df_collected columns and
## append the result.

data10 <- pew_10

## Citizenship: raw distribution, derived flag, then a cross-check
diag_df <- data10 %>%
  group_by(qn9) %>%
  summarise(count = n(), .groups = "drop")

# A missing qn9 and qn9 == 1 are both coded 1; every other answer is 0
data10$citizen <- ifelse(is.na(data10$qn9) | data10$qn9 == 1, 1, 0)

diag_df <- data10 %>%
  group_by(qn9, citizen) %>%
  summarise(count = n(), .groups = "drop")

## Demographics, nativity and age.
## Each replace() only touches rows whose condition is TRUE (NA conditions are
## skipped), and a later replace() overrides an earlier one.
data10 <- data10 %>%
  mutate(
    sex = "",
    sex = replace(sex, gender == 1, "M"),
    sex = replace(sex, gender == 2, "F"),
    state = NA,
    USborn = NA,
    USborn = replace(USborn, qn4 %in% c(1, 3), 0),
    USborn = replace(USborn, qn4 == 2, 1), # does not include born in PR
    USborn_parents = NA,
    USborn_parents = replace(
      USborn_parents,
      USborn == 0 | (qn7 %in% c(1, 3) & qn8 %in% c(1, 3)),
      0
    ), # does not include born in PR
    USborn_parents = replace(USborn_parents, (qn7 == 2) | (qn8 == 2), 1),
    USborn_parents = replace(USborn_parents, (qn7 == 2) & (qn8 == 2), 2),
    USborn_gparents = NA,
    USborn_gparents = replace(
      USborn_gparents,
      USborn == 0 | USborn_parents == 0,
      0
    ),
    # Age codes 1/2/3 become 24/40/57; 99, 4 and 9 become NA
    age = qn77,
    age = replace(age, age == 99, 99), # leaves the values unchanged
    age = replace(age, age == 1, 24),
    age = replace(age, age == 2, 40),
    age = replace(age, age == 3, 57),
    age = replace(age, age %in% c(99, 4, 9), NA),
    Mexican = 0,
    Mexican = replace(Mexican, qn3 == 1, 1)
  )

## Party identification
diag_df <- data10 %>%
  group_by(qn65, qn66) %>%
  summarise(count = n(), .groups = "drop")

data10 <- data10 %>%
  mutate(
    # Leaners are folded into the party; Republican is applied last and wins
    Party = NA,
    Party = replace(Party, qn65 == 2 | qn66 == 2, "Democrat"),
    Party = replace(Party, qn65 == 1 | qn66 == 1, "Republican"),
    # Independent / Other only fill rows that are still unassigned
    Party = replace(Party, qn65 == 3 & is.na(Party), "Independent"),
    Party = replace(
      Party,
      (qn65 %in% c(4, 8, 9, 7) & is.na(Party)) | (qn66 == 4 & is.na(Party)),
      "Other"
    ),
    # Party7 copies Party, then the lean item overrides it where answered
    Party7 = Party,
    Party7 = replace(Party7, qn66 == 1, "Lean Republican"),
    Party7 = replace(Party7, qn66 == 2, "Lean Democrat"),
    Party7 = replace(Party7, qn66 == 3, "Independent"),
    Party7 = replace(Party7, qn66 %in% c(4, 7, 8, 9), "Other")
  )

diag_df <- data10 %>%
  group_by(Party, Party7, qn65, qn66) %>%
  summarise(count = n(), .groups = "drop")

## Origin item distribution (last diagnostic table left in diag_df)
diag_df <- data10 %>%
  group_by(qn3) %>%
  summarise(count = n(), .groups = "drop")

## Survey metadata, education and origin.
## id is already stored under its target name, so it is not reassigned.
data10 <- data10 %>%
  mutate(
    Year = 2010,
    Survey = "Pew",
    Weight = weight,
    region = sample20,
    date = 90210, # Aug 17 - Sep 19, 2010. This is the midpoint.
    education = NA,
    education = replace(education, qn75 %in% c(1, 2), "NoHS"),
    education = replace(education, qn75 %in% c(3), "HSOnly"),
    education = replace(education, qn75 %in% c(6, 7), "CollGrad"),
    # includes non-4 year degree holders
    education = replace(education, qn75 %in% c(4, 5), "SomeColl_Other"),
    # qn3 codes above 5 stay "Other"; the finer labels for codes 6, 7, 8 and
    # 10 are not applied for this year
    origin = "Other",
    origin = replace(origin, qn3 == 1, "Mexican"),
    origin = replace(origin, qn3 == 2 | qn4 == 1, "Puerto Rican"),
    origin = replace(origin, qn3 == 3, "Cuban"),
    origin = replace(origin, qn3 == 4, "Dominican"),
    origin = replace(origin, qn3 == 5, "Salvadoran")
  )

## Columns with no source in this survey, then trim to the shared layout

names(df_collected)

data10 <- data10 %>%
  mutate(Weight_AllLatinos = NA, birthyear = NA)

data10 <- as.data.frame(data10)[, c(
  "Year", "Survey", "id", "Weight", "Weight_AllLatinos", "state", "region",
  "sex", "age", "Mexican", "USborn", "USborn_parents", "USborn_gparents",
  "Party", "Party7", "date", "birthyear", "education", "origin", "citizen"
)]

df_collected <- rbind(df_collected, data10)

```


```{r Pew 2011}

## Pew 2011: recode the raw file into the shared df_collected columns and
## append it to the running stack.

data11 <- pew_11

names(data11)

## Citizen
# diag_df <- data11 %>%
#   group_by(qn9) %>%
#   reframe(count=n())

## NOT RECORDED IN DATASET

data11$citizen <- NA

names(data11)

## Sex: "" is kept for any gender code other than 1 or 2
data11$sex <- ""
data11$sex[data11$gender == 1] <- "M"
data11$sex[data11$gender == 2] <- "F"
data11$state <- NA

## Nativity; qn4 codes other than 1-3 stay NA
data11$USborn <- NA
data11$USborn[data11$qn4 %in% c(1,3)] <- 0 
data11$USborn[data11$qn4 == 2] <- 1 # does not include born in PR

## Number of US-born parents (0, 1 or 2); later lines overwrite earlier ones
data11$USborn_parents <- NA
data11$USborn_parents[data11$USborn == 0 | (data11$qn7 %in% c(1, 3) & data11$qn8 %in% c(1, 3))] <- 0 # does not include born in PR
data11$USborn_parents[(data11$qn7 == 2) | (data11$qn8 == 2)] <- 1
data11$USborn_parents[(data11$qn7 == 2) & (data11$qn8 == 2)] <- 2

## Grandparents: only the 0 case can be derived here; everything else is NA
data11$USborn_gparents <- NA
data11$USborn_gparents[data11$USborn == 0 | data11$USborn_parents == 0] <- 0

## Age: codes 1/2/3 become 24/40/57; 99, 4 and 9 become NA.
## NOTE(review): confirm against the codebook how qn95 is coded.
data11$age <- data11$qn95
data11$age[data11$age == 99] <- 99
data11$age[data11$age == 1] <- 24
data11$age[data11$age == 2] <- 40
data11$age[data11$age == 3] <- 57
data11$age[data11$age %in% c(99, 4, 9)] <- NA

## NOTE(review): Mexican uses qn301 only, while origin below also accepts
## qn302 == 1 -- confirm which definition is intended.
data11$Mexican <- 0
data11$Mexican[data11$qn301 == 1] <- 1

## Party (leaners folded into the party). The Republican line runs last of the
## two and wins when both match; Independent / Other only fill remaining NAs.
data11$Party <- NA
data11$Party[data11$qn81 == 2 | data11$qn82 == 2] <- "Democrat"
data11$Party[data11$qn81 == 1 | data11$qn82 == 1] <- "Republican"
data11$Party[(data11$qn81 == 3 & is.na(data11$Party))] <- "Independent"
data11$Party[(data11$qn81 %in%c(4,7,8,9) & is.na(data11$Party)) | (data11$qn82 %in%c(4,7,8,9) & is.na(data11$Party))] <- "Other"

## Party7 starts as Party and is overwritten wherever the lean item (qn82)
## has a listed code.
data11$Party7 <- data11$Party
data11$Party7[data11$qn82 == 1] <- "Lean Republican"
data11$Party7[data11$qn82 == 2] <- "Lean Democrat"
data11$Party7[data11$qn82 == 3] <- "Independent"
data11$Party7[data11$qn82 %in% c(4,7,8,9)] <- "Other"

diag2_df <- data11 %>%
  group_by(Party,Party7,qn81,qn82) %>%
  reframe(count=n())


## Survey metadata
data11$Year <- 2011
data11$Survey <- "Pew"
data11$id <- data11$caseid
data11$Weight <- data11$weight
data11$region <- data11$sample20

## Stored as a number in MMDDYY form
data11$date <- 112311 # Nov 9 - Dec 7, 2011. This is the midpoint.

## Education: qn94 codes outside 1-7 stay NA
data11$education <- NA
data11$education[data11$qn94 %in% c(1,2)] <- "NoHS"
data11$education[data11$qn94 %in% c(3)] <- "HSOnly"
data11$education[data11$qn94 %in% c(6,7)] <- "CollGrad"
data11$education[data11$qn94 %in% c(4,5)] <- "SomeColl_Other" # includes non-4 year degree holders

diag_df <- data11 %>%
  group_by(qn301) %>%
  reframe(count=n())

## Origin.
## The earlier version tested qn301 twice on every line after Mexican
## (e.g. `qn301 == 2 | qn301 == 2`), so qn302 was only ever consulted for
## Mexican. qn302 is now used for all five groups, but only as a fallback:
## it is applied first and qn301 is applied afterwards, so a listed qn301 code
## always wins. Rows labelled by the earlier version keep the same label; only
## rows that were "Other" with qn302 in 2-5 change.
## NOTE(review): assumes qn302 uses the same codes as qn301 (the original
## Mexican line already relied on this) -- confirm against the codebook.
## NOTE(review): other years also label qn4 == 1 (born in PR) as Puerto Rican;
## that rule is not applied here -- confirm whether it should be.
data11$origin <- "Other"
data11$origin[data11$qn302 == 1] <- "Mexican"
data11$origin[data11$qn302 == 2] <- "Puerto Rican"
data11$origin[data11$qn302 == 3] <- "Cuban"
data11$origin[data11$qn302 == 4] <- "Dominican"
data11$origin[data11$qn302 == 5] <- "Salvadoran"
data11$origin[data11$qn301 == 1] <- "Mexican"
data11$origin[data11$qn301 == 2] <- "Puerto Rican"
data11$origin[data11$qn301 == 3] <- "Cuban"
data11$origin[data11$qn301 == 4] <- "Dominican"
data11$origin[data11$qn301 == 5] <- "Salvadoran"
# data11$origin[data11$qn3 == 6] <- "Other Central American"
# data11$origin[data11$qn3 == 7] <- "Other South American"
# data11$origin[data11$qn3 == 8] <- "Other Caribbean"
# data11$origin[data11$qn3 == 10] <- "Other European"


## Add empty variables

names(df_collected)

data11$Weight_AllLatinos <- NA
data11$birthyear <- NA

## Keep only the shared columns, in the shared order, then append
data11 <- subset(as.data.frame(data11), select=c(Year, Survey, id, Weight, Weight_AllLatinos, state, region, sex, age, Mexican, USborn, USborn_parents, USborn_gparents, Party, Party7, date, birthyear, education, origin, citizen))

df_collected <- rbind(df_collected,
                      data11)


```



```{r Pew 2012}

## Pew 2012: map the raw survey onto the shared df_collected columns and
## append the result.

data12 <- pew_12

names(pew_12)

## Citizenship: raw distribution, derived flag, then a cross-check
diag_df <- data12 %>%
  group_by(qn9) %>%
  summarise(count = n(), .groups = "drop")

# A missing qn9 and qn9 == 1 are both coded 1; every other answer is 0
data12$citizen <- ifelse(is.na(data12$qn9) | data12$qn9 == 1, 1, 0)

diag_df <- data12 %>%
  group_by(qn9, citizen) %>%
  summarise(count = n(), .groups = "drop")

## Demographics, nativity and age.
## Each replace() only touches rows whose condition is TRUE (NA conditions are
## skipped), and a later replace() overrides an earlier one.
data12 <- data12 %>%
  mutate(
    sex = "",
    sex = replace(sex, gender == 1, "M"),
    sex = replace(sex, gender == 2, "F"),
    state = NA,
    USborn = NA,
    USborn = replace(USborn, qn4 %in% c(1, 3), 0),
    USborn = replace(USborn, qn4 == 2, 1), # does not include born in PR
    USborn_parents = NA,
    USborn_parents = replace(
      USborn_parents,
      USborn == 0 | (qn7 %in% c(1, 3) & qn8 %in% c(1, 3)),
      0
    ), # does not include born in PR
    USborn_parents = replace(USborn_parents, (qn7 == 2) | (qn8 == 2), 1),
    USborn_parents = replace(USborn_parents, (qn7 == 2) & (qn8 == 2), 2),
    USborn_gparents = NA,
    USborn_gparents = replace(
      USborn_gparents,
      USborn == 0 | USborn_parents == 0,
      0
    ),
    # Age codes 1/2/3 become 24/40/57; 99, 4 and 9 become NA
    age = qn73,
    age = replace(age, age == 99, 99), # leaves the values unchanged
    age = replace(age, age == 1, 24),
    age = replace(age, age == 2, 40),
    age = replace(age, age == 3, 57),
    age = replace(age, age %in% c(99, 4, 9), NA),
    Mexican = 0,
    Mexican = replace(Mexican, qn3 == 1, 1)
  )

## Party identification
diag_df <- data12 %>%
  group_by(qn61, qn62) %>%
  summarise(count = n(), .groups = "drop")

data12 <- data12 %>%
  mutate(
    # Leaners are folded into the party; Republican is applied last and wins
    Party = NA,
    Party = replace(Party, qn61 == 2 | qn62 == 2, "Democrat"),
    Party = replace(Party, qn61 == 1 | qn62 == 1, "Republican"),
    # Independent / Other only fill rows that are still unassigned
    Party = replace(Party, qn61 == 3 & is.na(Party), "Independent"),
    Party = replace(
      Party,
      (qn61 %in% c(4, 7, 8, 9) & is.na(Party)) |
        (qn62 %in% c(4, 7, 8, 9) & is.na(Party)),
      "Other"
    ),
    # Party7 copies Party, then the lean item overrides it where answered
    Party7 = Party,
    Party7 = replace(Party7, qn62 == 1, "Lean Republican"),
    Party7 = replace(Party7, qn62 == 2, "Lean Democrat"),
    Party7 = replace(Party7, qn62 == 3, "Independent"),
    Party7 = replace(Party7, qn62 %in% c(4, 7, 8, 9), "Other")
  )

diag2_df <- data12 %>%
  group_by(Party, Party7, qn61, qn62) %>%
  summarise(count = n(), .groups = "drop")

## Survey metadata, education and origin.
## id and region are already stored under their target names, so they are
## not reassigned.
data12 <- data12 %>%
  mutate(
    Year = 2012,
    Survey = "Pew",
    Weight = weight,
    date = 92012, # Sep 7 - Oct 4, 2012. This is the midpoint.
    education = NA,
    education = replace(education, qn72 %in% c(1, 2), "NoHS"),
    education = replace(education, qn72 %in% c(3), "HSOnly"),
    education = replace(education, qn72 %in% c(6, 7, 8), "CollGrad"),
    # includes non-4 year degree holders
    education = replace(education, qn72 %in% c(4, 5), "SomeColl_Other"),
    # qn3 codes above 5 stay "Other"; the finer labels for codes 6, 7, 8 and
    # 10 are not applied for this year
    origin = "Other",
    origin = replace(origin, qn3 == 1, "Mexican"),
    origin = replace(origin, qn3 == 2 | qn4 == 1, "Puerto Rican"),
    origin = replace(origin, qn3 == 3, "Cuban"),
    origin = replace(origin, qn3 == 4, "Dominican"),
    origin = replace(origin, qn3 == 5, "Salvadoran")
  )

## Columns with no source in this survey, then trim to the shared layout

names(df_collected)

data12 <- data12 %>%
  mutate(Weight_AllLatinos = NA, birthyear = NA)

data12 <- as.data.frame(data12)[, c(
  "Year", "Survey", "id", "Weight", "Weight_AllLatinos", "state", "region",
  "sex", "age", "Mexican", "USborn", "USborn_parents", "USborn_gparents",
  "Party", "Party7", "date", "birthyear", "education", "origin", "citizen"
)]

df_collected <- rbind(df_collected, data12)

```


```{r Pew 2013}

## Pew 2013: recode the raw file into the shared df_collected columns and
## append it to the running stack.

data13 <- pew_13

names(pew_13)

## Citizen
# diag_df <- data13 %>%
#   group_by(im9) %>%
#   reframe(count=n())

## No citizenship data

data13$citizen <- NA

## Sex: "" is kept for any s5 code other than 1 or 2
data13$sex <- ""
data13$sex[data13$s5 == 1] <- "M"
data13$sex[data13$s5 == 2] <- "F"
data13$state <- NA

## Nativity; imqn4 codes other than 1-3 stay NA
data13$USborn <- NA
data13$USborn[data13$imqn4 %in% c(1,3)] <- 0 
data13$USborn[data13$imqn4 == 2] <- 1 # does not include born in PR

## Parent / grandparent nativity is left entirely NA for this year; the
## recodes are kept commented out for reference.
data13$USborn_parents <- NA
# data13$USborn_parents[data13$USborn == 0 | (data13$Q410 %in% c(1, 3) & data13$Q411 %in% c(1, 3))] <- 0 # does not include born in PR
# data13$USborn_parents[(data13$Q410 == 2) | (data13$Q411 == 2)] <- 1
# data13$USborn_parents[(data13$Q410 == 2) & (data13$Q411 == 2)] <- 2
data13$USborn_gparents <- NA
# data13$USborn_gparents[data13$USborn == 0 | data13$USborn_parents == 0] <- 0

## Above variables are not used in analyses


## Age: codes 1-15 are replaced by the values below and 99 becomes NA.
## The replacement values never coincide with a code that is tested later, so
## the sequential recode cannot chain. The trailing 1/2/3 -> 24/40/57 and
## 4/9 -> NA recodes copied from other years were removed: after the lines
## below no row can still hold 1, 2, 3, 4 or 9, so they never matched.
data13$age <- as.numeric(data13$age)
data13$age[data13$age == 1] <- 21
data13$age[data13$age == 2] <- 27
data13$age[data13$age == 3] <- 32
data13$age[data13$age == 4] <- 37
data13$age[data13$age == 5] <- 42
data13$age[data13$age == 6] <- 47
data13$age[data13$age == 7] <- 52
data13$age[data13$age == 8] <- 57
data13$age[data13$age == 9] <- 62
data13$age[data13$age == 10] <- 67
data13$age[data13$age == 11] <- 72
data13$age[data13$age == 12] <- 77
data13$age[data13$age == 13] <- 82
data13$age[data13$age == 14] <- 87
data13$age[data13$age == 15] <- 90 # 90 and over
data13$age[data13$age %in% 99] <- NA

## NOTE(review): origin below labels imqn3 == 1 as "Mexican", yet this flag is
## left NA for every row -- confirm whether it should be derived from imqn3 as
## in the other years.
data13$Mexican <- NA


## PID

diag_df <- data13 %>%
  group_by(party, partyln) %>%
  reframe(count=n())

## case_when takes the first matching branch, so Democrat wins over Republican
## when both conditions hold; rows matching nothing (including all-NA) become
## "Other".
data13 <- data13 %>%
  mutate(party_clean = case_when(
    (party == 2 | partyln == 2) ~ "Democrat",
    (party == 1 | partyln == 1) ~ "Republican",
    (party == 3 | partyln ==3) ~ "Independent",
    TRUE ~ "Other")
    )

diag_df <- data13 %>%
  group_by(party_clean, party, partyln ) %>%
  reframe(count=n())

## Party and Party7 are identical for this year (no separate lean categories)
data13$party_old <- data13$party
data13$Party <- data13$party_clean
data13$Party7 <- data13$party_clean

diag_df <- data13 %>%
  group_by(Party, Party7, party, partyln ) %>%
  reframe(count=n())


## Survey metadata
data13$Year <- 2013
data13$Survey <- "Pew"
data13$id <- data13$ID
data13$Weight <- data13$weight

## Region

diag_df <- data13 %>%
  group_by(samp20) %>%
  reframe(count=n())

# 1 Northeast
# 2 North Central
# 3 South
# 4 West

## Region is stored as a text label here; any other samp20 value becomes NA
data13 <- data13 %>%
  mutate(region = case_when(
    samp20 == "1" ~ "Northeast",
    samp20 == "2" ~ "North Central",
    samp20 == "3" ~ "South",
    samp20 == "4" ~ "West",
    TRUE ~ NA
  ))


diag_df <- data13 %>%
  group_by(samp20,
           region) %>%
  reframe(count=n())


## Stored as a number in MMDDYY form
data13$date <- 62613 # May 24 - Jul 28, 2013. This is the midpoint.

## Education: imeduc codes outside 1-8 stay NA
data13$education <- NA
data13$education[data13$imeduc %in% c(1,2)] <- "NoHS"
data13$education[data13$imeduc %in% c(3)] <- "HSOnly"
data13$education[data13$imeduc %in% c(6,7,8)] <- "CollGrad"
data13$education[data13$imeduc %in% c(4,5)] <- "SomeColl_Other" # includes non-4 year degree holders


## Origin
## NOTE(review): the Puerto Rican line tests imqn5 == 4, whereas the other
## years test the birthplace item for code 1 (born in PR) -- confirm that
## imqn5 exists and that 4 is the intended code.

data13$origin <- "Other"
data13$origin[data13$imqn3 == 1] <- "Mexican"
data13$origin[data13$imqn3 == 2 | data13$imqn5 == 4] <- "Puerto Rican"
data13$origin[data13$imqn3 == 3] <- "Cuban"
data13$origin[data13$imqn3 == 4] <- "Dominican"
data13$origin[data13$imqn3 == 5] <- "Salvadoran"
# data13$origin[data13$imqn3 == 6] <- "Other Central American"
# data13$origin[data13$imqn3 == 7] <- "Other South American"

## Add empty variables

names(df_collected)

data13$Weight_AllLatinos <- NA
data13$birthyear <- NA

## Keep only the shared columns, in the shared order, then append
data13 <- subset(as.data.frame(data13), select=c(Year, Survey, id, Weight, Weight_AllLatinos, state, region, sex, age, Mexican, USborn, USborn_parents, USborn_gparents, Party, Party7, date, birthyear, education, origin, citizen))

df_collected <- rbind(df_collected,
                      data13)

```


```{r Pew 2014}

## Pew 2014: recode the raw file into the shared df_collected columns and
## append it to the running stack.

data14 <- pew_14

names(data14)

## Citizen
## q9 == 1 and a missing q9 are both coded as citizen (1); everything else
## is 0.
diag_df <- data14 %>%
  group_by(q9) %>%
  reframe(count=n())

data14 <- data14 %>%
  mutate(citizen = if_else(is.na(q9) == TRUE | q9 == 1,
                           1,
                           0))

## Cross-tab of the derived flag against the raw item, for a visual check
diag_df <- data14 %>%
  group_by(citizen, q9) %>%
  reframe(count=n())

## Sex: "" is kept for any gender code other than 1 or 2
data14$sex <- ""
data14$sex[data14$gender == 1] <- "M"
data14$sex[data14$gender == 2] <- "F"
data14$state <- NA

## Nativity; q4 codes other than 1-3 stay NA
data14$USborn <- NA
data14$USborn[data14$q4 %in% c(1,3)] <- 0 
data14$USborn[data14$q4 == 2] <- 1 # does not include born in PR

## Number of US-born parents (0, 1 or 2); later lines overwrite earlier ones
data14$USborn_parents <- NA
data14$USborn_parents[data14$USborn == 0 | (data14$q7 %in% c(1, 3) & data14$q8 %in% c(1, 3))] <- 0 # does not include born in PR
data14$USborn_parents[(data14$q7 == 2) | (data14$q8 == 2)] <- 1
data14$USborn_parents[(data14$q7 == 2) & (data14$q8 == 2)] <- 2

## Grandparents: only the 0 case can be derived here; everything else is NA
data14$USborn_gparents <- NA
data14$USborn_gparents[data14$USborn == 0 | data14$USborn_parents == 0] <- 0

## Age: codes 1/2/3 become 24/40/57; 99, 4 and 9 become NA.
## NOTE(review): confirm against the codebook how age is coded in this file.
data14$age <- data14$age
data14$age[data14$age == 99] <- 99
data14$age[data14$age == 1] <- 24
data14$age[data14$age == 2] <- 40
data14$age[data14$age == 3] <- 57
data14$age[data14$age %in% c(99, 4, 9)] <- NA

data14$Mexican <- 0
data14$Mexican[data14$q3 == 1] <- 1

diag_df <- data14 %>%
  group_by(party, partyln) %>%
  reframe(count=n())

## Party (leaners folded into the party). The Republican line runs last of the
## two and wins when both match; Independent / Other only fill remaining NAs.
data14$Party <- NA
data14$Party[data14$party == 2 | data14$partyln == 2] <- "Democrat"
data14$Party[data14$party == 1 | data14$partyln == 1] <- "Republican"
data14$Party[(data14$party == 3 & is.na(data14$Party))] <- "Independent"
data14$Party[(data14$party %in% c(4,5,7,8,9) & is.na(data14$Party))] <- "Other"

## Party7 starts as Party and is overwritten wherever the lean item (partyln)
## has a listed code.
data14$Party7 <- data14$Party
data14$Party7[data14$partyln == 1] <- "Lean Republican"
data14$Party7[data14$partyln == 2] <- "Lean Democrat"
data14$Party7[data14$partyln == 3] <- "Independent"
data14$Party7[data14$partyln %in% c(4,5,7,8,9)] <- "Other"

diag_df <- data14 %>%
  group_by(Party, Party7, party, partyln) %>%
  reframe(count=n())

## Survey metadata
data14$Year <- 2014
data14$Survey <- "Pew"
data14$id <- data14$caseid
data14$Weight <- data14$weight

## Region is stored as a text label here; any other code becomes NA
data14 <- data14 %>%
  mutate(region = case_when(
    sample_region == 1 ~ "Northeast",
    sample_region == 2 ~ "North Central",
    sample_region == 3 ~ "South",
    sample_region == 4 ~ "West",
    TRUE ~ NA
  ))

## Stored as a number in MMDDYY form
data14$date <- 92514 # Sep 11 - Oct 9, 2014. This is the midpoint.

## Education: educ codes outside 1-8 stay NA
data14$education <- NA
data14$education[data14$educ %in% c(1,2)] <- "NoHS"
data14$education[data14$educ %in% c(3)] <- "HSOnly"
data14$education[data14$educ %in% c(6,7,8)] <- "CollGrad"
data14$education[data14$educ %in% c(4,5)] <- "SomeColl_Other" # includes non-4 year degree holders

## Origin: "Other" unless q3 has one of the listed codes. The Puerto Rican
## line also matches q4 == 1 (born in PR).
## Salvadoran is code 5, as in the other years. The earlier version tested
## q3 == 3 again, which relabelled every Cuban respondent as Salvadoran and
## left code 5 as "Other".
data14$origin <- "Other"
data14$origin[data14$q3 == 1] <- "Mexican"
data14$origin[data14$q3 == 2 | data14$q4 == 1] <- "Puerto Rican"
data14$origin[data14$q3 == 3] <- "Cuban"
data14$origin[data14$q3 == 4] <- "Dominican"
data14$origin[data14$q3 == 5] <- "Salvadoran"

## Columns with no source in this survey
data14$Weight_AllLatinos <- NA
data14$birthyear <- NA

## Keep only the shared columns, in the shared order
data14 <- subset(as.data.frame(data14), select=c(Year, Survey, id, Weight, Weight_AllLatinos, state, region, sex, age, Mexican, USborn, USborn_parents, USborn_gparents, Party, Party7, date, birthyear, education, origin, citizen))

## Prints one TRUE/FALSE per column: layout check before appending
names(data14) == names(df_collected)

df_collected <- rbind(df_collected,
                      data14)

```


```{r Pew 2015}

## Pew 2015: map the raw survey onto the shared df_collected columns and
## append the result.

data15 <- pew_15

names(data15)

## Citizenship: raw distribution, then the derived flag
diag_df <- data15 %>%
  group_by(q9) %>%
  summarise(count = n(), .groups = "drop")

# A missing q9 and q9 == 1 are both coded 1; every other answer is 0
data15$citizen <- if_else(is.na(data15$q9) | data15$q9 == 1, 1, 0)

## Demographics, nativity and age.
## Each replace() only touches rows whose condition is TRUE (NA conditions are
## skipped), and a later replace() overrides an earlier one.
data15 <- data15 %>%
  mutate(
    # sex already exists in the raw file; convert its codes to text first
    sex = as.character(sex),
    sex = replace(sex, sex == "1", "M"),
    sex = replace(sex, sex == "2", "F"),
    state = NA,
    USborn = NA,
    USborn = replace(USborn, q4 %in% c(1, 3), 0),
    USborn = replace(USborn, q4 == 2, 1), # does not include born in PR
    USborn_parents = NA,
    USborn_parents = replace(
      USborn_parents,
      USborn == 0 | (q7 %in% c(1, 3) & q8 %in% c(1, 3)),
      0
    ), # does not include born in PR
    USborn_parents = replace(USborn_parents, (q7 == 2) | (q8 == 2), 1),
    USborn_parents = replace(USborn_parents, (q7 == 2) & (q8 == 2), 2),
    USborn_gparents = NA,
    USborn_gparents = replace(
      USborn_gparents,
      USborn == 0 | USborn_parents == 0,
      0
    ),
    # One 0/1 flag per grandparent item (1 when the item equals 2), summed
    fpat = 0,
    fpat = replace(fpat, q8ab == 2, 1),
    fmat = 0,
    fmat = replace(fmat, q8aa == 2, 1),
    mmat = 0,
    mmat = replace(mmat, q8ba == 2, 1),
    mpat = 0,
    mpat = replace(mpat, q8bb == 2, 1),
    gparents = fpat + fmat + mmat + mpat,
    # Rows not already forced to 0 above take the summed count
    USborn_gparents = replace(
      USborn_gparents,
      is.na(USborn_gparents),
      gparents[is.na(USborn_gparents)]
    ),
    # Age codes 1/2/3 become 24/40/57; 99, 4 and 9 become NA
    age = replace(age, age == 99, 99), # leaves the values unchanged
    age = replace(age, age == 1, 24),
    age = replace(age, age == 2, 40),
    age = replace(age, age == 3, 57),
    age = replace(age, age %in% c(99, 4, 9), NA),
    Mexican = 0,
    Mexican = replace(Mexican, q3_combo == 1, 1)
  )

## Party identification
diag_df <- data15 %>%
  group_by(party, partyln) %>%
  summarise(count = n(), .groups = "drop")

data15 <- data15 %>%
  mutate(
    # Leaners are folded into the party; Republican is applied last and wins
    Party = NA,
    Party = replace(Party, party == 2 | partyln == 2, "Democrat"),
    Party = replace(Party, party == 1 | partyln == 1, "Republican"),
    # Independent / Other only fill rows that are still unassigned
    Party = replace(Party, party %in% c(3, 4) & is.na(Party), "Independent"),
    Party = replace(Party, party %in% c(5, 7, 8, 9) & is.na(Party), "Other"),
    # Party7 copies Party, then the lean item overrides it where answered
    Party7 = Party,
    Party7 = replace(Party7, partyln == 1, "Lean Republican"),
    Party7 = replace(Party7, partyln == 2, "Lean Democrat"),
    Party7 = replace(Party7, partyln == 3, "Independent"),
    Party7 = replace(Party7, partyln %in% c(4, 5, 7, 8, 9), "Other")
  )

diag2_df <- data15 %>%
  group_by(Party, Party7, party, partyln) %>%
  summarise(count = n(), .groups = "drop")

## Survey metadata, education and origin
data15 <- data15 %>%
  mutate(
    Year = 2015,
    Survey = "Pew",
    id = caseid,
    Weight = weights,
    region = sample_region,
    date = 111015, # Oct 21 - Nov 30, 2015. This is the midpoint.
    education = NA,
    education = replace(education, educ %in% c(1, 2), "NoHS"),
    education = replace(education, educ %in% c(3), "HSOnly"),
    education = replace(education, educ %in% c(6, 7, 8), "CollGrad"),
    # includes non-4 year degree holders
    education = replace(education, educ %in% c(4, 5), "SomeColl_Other"),
    # Any q3_combo code not listed stays "Other"
    origin = "Other",
    origin = replace(origin, q3_combo == 1, "Mexican"),
    origin = replace(origin, q3_combo == 2 | q4 == 1, "Puerto Rican"),
    origin = replace(origin, q3_combo == 3, "Cuban"),
    origin = replace(origin, q3_combo == 4, "Dominican"),
    origin = replace(origin, q3_combo == 5, "Salvadoran")
  )

## Columns with no source in this survey, then trim to the shared layout

names(df_collected)

data15 <- data15 %>%
  mutate(Weight_AllLatinos = NA, birthyear = NA)

data15 <- as.data.frame(data15)[, c(
  "Year", "Survey", "id", "Weight", "Weight_AllLatinos", "state", "region",
  "sex", "age", "Mexican", "USborn", "USborn_parents", "USborn_gparents",
  "Party", "Party7", "date", "birthyear", "education", "origin", "citizen"
)]

df_collected <- rbind(df_collected, data15)


```



```{r Pew 2016}

## Pew 2016: map the raw survey onto the shared df_collected columns and
## append the result.

data16 <- pew_16

names(data16)

## Citizenship: raw distribution, then the derived flag
diag_df <- data16 %>%
  group_by(qn9) %>%
  summarise(count = n(), .groups = "drop")

# A missing qn9 and qn9 == 1 are both coded 1; every other answer is 0
data16$citizen <- if_else(is.na(data16$qn9) | data16$qn9 == 1, 1, 0)

## Demographics, nativity and age.
## Each replace() only touches rows whose condition is TRUE (NA conditions are
## skipped), and a later replace() overrides an earlier one.
data16 <- data16 %>%
  mutate(
    # sex already exists in the raw file; convert its codes to text first
    sex = as.character(sex),
    sex = replace(sex, sex == 1, "M"),
    sex = replace(sex, sex == 2, "F"),
    state = NA,
    USborn = NA,
    USborn = replace(USborn, qn4 %in% c(1, 3), 0),
    USborn = replace(USborn, qn4 == 2, 1), # does not include born in PR
    USborn_parents = NA,
    USborn_parents = replace(
      USborn_parents,
      USborn == 0 | (qn7 %in% c(1, 3) & qn8 %in% c(1, 3)),
      0
    ), # does not include born in PR
    USborn_parents = replace(USborn_parents, (qn7 == 2) | (qn8 == 2), 1),
    USborn_parents = replace(USborn_parents, (qn7 == 2) & (qn8 == 2), 2),
    USborn_gparents = NA,
    USborn_gparents = replace(
      USborn_gparents,
      USborn == 0 | USborn_parents == 0,
      0
    ),
    # Age codes 1/2/3 become 27/43/61; 99, 4 and 9 become NA
    age = replace(age, age == 99, 99), # leaves the values unchanged
    age = replace(age, age == 1, 27),
    age = replace(age, age == 2, 43),
    age = replace(age, age == 3, 61),
    age = replace(age, age %in% c(99, 4, 9), NA),
    Mexican = 0,
    Mexican = replace(Mexican, qn3 == 1, 1)
  )

## Party identification
diag_df <- data16 %>%
  group_by(party, partyln) %>%
  summarise(count = n(), .groups = "drop")

data16 <- data16 %>%
  mutate(
    # Leaners are folded into the party; Republican is applied last and wins
    Party = NA,
    Party = replace(Party, party == 2 | partyln == 2, "Democrat"),
    Party = replace(Party, party == 1 | partyln == 1, "Republican"),
    # Independent / Other only fill rows that are still unassigned
    Party = replace(Party, party %in% c(3, 4) & is.na(Party), "Independent"),
    Party = replace(
      Party,
      party %in% c(5, 6, 7, 8, 9) & is.na(Party),
      "Other"
    ),
    # Party7 copies Party, then the lean item overrides it where answered
    Party7 = Party,
    Party7 = replace(Party7, partyln == 1, "Lean Republican"),
    Party7 = replace(Party7, partyln == 2, "Lean Democrat"),
    Party7 = replace(Party7, partyln == 3, "Independent"),
    Party7 = replace(Party7, partyln %in% c(8, 9), "Other")
  )

diag2_df <- data16 %>%
  group_by(Party, Party7, party, partyln) %>%
  summarise(count = n(), .groups = "drop")

## Survey metadata, education and origin
data16 <- data16 %>%
  mutate(
    Year = 2016,
    Survey = "Pew",
    id = case_id,
    Weight = weights,
    region = NA,
    date = 90716, # Aug 23 - Sep 21, 2016. This is the midpoint.
    education = NA,
    education = replace(education, educ %in% c(1, 2), "NoHS"),
    education = replace(education, educ %in% c(3), "HSOnly"),
    education = replace(education, educ %in% c(6, 7, 8), "CollGrad"),
    # includes non-4 year degree holders
    education = replace(education, educ %in% c(4, 5), "SomeColl_Other"),
    # Any qn3 code not listed stays "Other"
    origin = "Other",
    origin = replace(origin, qn3 == 1, "Mexican"),
    origin = replace(origin, qn3 == 2 | qn4 == 1, "Puerto Rican"),
    origin = replace(origin, qn3 == 3, "Cuban"),
    origin = replace(origin, qn3 == 4, "Dominican"),
    origin = replace(origin, qn3 == 5, "Salvadoran"),
    origin = replace(origin, qn3 == 6, "Spanish")
  )

## Columns with no source in this survey, then trim to the shared layout

names(df_collected)

data16 <- data16 %>%
  mutate(Weight_AllLatinos = NA, birthyear = NA)

data16 <- as.data.frame(data16)[, c(
  "Year", "Survey", "id", "Weight", "Weight_AllLatinos", "state", "region",
  "sex", "age", "Mexican", "USborn", "USborn_parents", "USborn_gparents",
  "Party", "Party7", "date", "birthyear", "education", "origin", "citizen"
)]

df_collected <- rbind(df_collected, data16)

```


```{r Pew 2017}

## Pew 2017: inspected only. Nothing from this file is appended to
## df_collected.

data17 <- pew_17

names(pew_17)

## There is no citizenship question; tabulate the two nativity fields instead
diag_df <- pew_17 %>%
  group_by(nativity, nativity_status) %>%
  summarise(count = n(), .groups = "drop")

## The file has no national-origin item and the instrument differs
## substantially from the other years, so this wave is left out of the final
## dataset.

```


```{r Pew 2018}

## Pew 2018: map the raw survey onto the shared df_collected columns and
## append the result.

data18 <- pew_18

names(data18)

## Citizenship: raw distribution, then the derived flag
diag_df <- data18 %>%
  group_by(qn9) %>%
  summarise(count = n(), .groups = "drop")

# A missing qn9 and qn9 == 1 are both coded 1; every other answer is 0
data18$citizen <- if_else(is.na(data18$qn9) | data18$qn9 == 1, 1, 0)

## Demographics, nativity and age.
## Each replace() only touches rows whose condition is TRUE (NA conditions are
## skipped), and a later replace() overrides an earlier one.
data18 <- data18 %>%
  mutate(
    # sex already exists in the raw file; convert its codes to text first
    sex = as.character(sex),
    sex = replace(sex, sex == 1, "M"),
    sex = replace(sex, sex == 2, "F"),
    state = NA,
    USborn = NA,
    USborn = replace(USborn, qn4 %in% c(1, 3), 0),
    USborn = replace(USborn, qn4 == 2, 1), # does not include born in PR
    USborn_parents = NA,
    USborn_parents = replace(
      USborn_parents,
      USborn == 0 | (qn7 %in% c(1, 3) & qn8 %in% c(1, 3)),
      0
    ), # does not include born in PR
    USborn_parents = replace(USborn_parents, (qn7 == 2) | (qn8 == 2), 1),
    USborn_parents = replace(USborn_parents, (qn7 == 2) & (qn8 == 2), 2),
    USborn_gparents = NA,
    USborn_gparents = replace(
      USborn_gparents,
      USborn == 0 | USborn_parents == 0,
      0
    ),
    # Age codes 1/2/3 become 24/40/57; 99, 4 and 9 become NA
    age = replace(age, age == 99, 99), # leaves the values unchanged
    age = replace(age, age == 1, 24),
    age = replace(age, age == 2, 40),
    age = replace(age, age == 3, 57),
    age = replace(age, age %in% c(99, 4, 9), NA),
    Mexican = 0,
    Mexican = replace(Mexican, qn3 == 1, 1)
  )

## Party identification
data18 <- data18 %>%
  mutate(
    # Leaners are folded into the party; Republican is applied last and wins
    Party = NA,
    Party = replace(Party, party == 2 | partyln == 2, "Democrat"),
    Party = replace(Party, party == 1 | partyln == 1, "Republican"),
    # Independent / Other only fill rows that are still unassigned
    Party = replace(Party, party %in% c(3, 4) & is.na(Party), "Independent"),
    Party = replace(
      Party,
      party %in% c(5, 6, 7, 8, 9) & is.na(Party),
      "Other"
    ),
    # Party7 copies Party, then the lean item overrides it where answered
    Party7 = Party,
    Party7 = replace(Party7, partyln == 1, "Lean Republican"),
    Party7 = replace(Party7, partyln == 2, "Lean Democrat"),
    Party7 = replace(Party7, partyln == 3, "Independent"),
    Party7 = replace(Party7, partyln %in% c(5, 6, 7, 8, 9), "Other")
  )

diag2_df <- data18 %>%
  group_by(Party, Party7, party, partyln) %>%
  summarise(count = n(), .groups = "drop")

## Survey metadata, education and origin
data18 <- data18 %>%
  mutate(
    Year = 2018,
    Survey = "Pew",
    id = case_id,
    Weight = weight,
    region = NA,
    date = 81718, # July 26 - September 9, 2018. This is the midpoint.
    education = NA,
    education = replace(education, educ %in% c(1, 2), "NoHS"),
    education = replace(education, educ %in% c(3), "HSOnly"),
    education = replace(education, educ %in% c(6, 7, 8), "CollGrad"),
    # includes non-4 year degree holders
    education = replace(education, educ %in% c(4, 5), "SomeColl_Other"),
    # Any qn3 code not listed stays "Other"
    origin = "Other",
    origin = replace(origin, qn3 == 1, "Mexican"),
    origin = replace(origin, qn3 == 2 | qn4 == 1, "Puerto Rican"),
    origin = replace(origin, qn3 == 3, "Cuban"),
    origin = replace(origin, qn3 == 4, "Dominican"),
    origin = replace(origin, qn3 == 5, "Salvadoran"),
    origin = replace(origin, qn3 == 6, "Spanish")
  )

## Columns with no source in this survey, then trim to the shared layout

names(df_collected)

data18 <- data18 %>%
  mutate(Weight_AllLatinos = NA, birthyear = NA)

data18 <- as.data.frame(data18)[, c(
  "Year", "Survey", "id", "Weight", "Weight_AllLatinos", "state", "region",
  "sex", "age", "Mexican", "USborn", "USborn_parents", "USborn_gparents",
  "Party", "Party7", "date", "birthyear", "education", "origin", "citizen"
)]

df_collected <- rbind(df_collected, data18)


```
## LNS

```{r LNS 2006}

LNS <- lns_06

names(LNS)


## Restrict to citizens

diag_df <- LNS %>%
  group_by(BORNUS,
           NATUSCIT) %>%
  reframe(count=n())

LNS <- LNS %>%
  mutate(citizen = ifelse(BORNUS %in% c("(1) Mainland US", "(2) Puerto Rico") | NATUSCIT == "(1) YES", 1, 0))


LNS$sex <- ""
LNS$sex[LNS$SEX == "(1) MALE"] <- "M"
LNS$sex[LNS$SEX == "(2) FEMALE"] <- "F"

#Coding for Mexican Yes=1 No=0
LNS$Mexican <-0
LNS$Mexican[LNS$ANCESTRY=="(12) Mexico"] <-1
LNS$Mexican[LNS$ANCESTRY2=="(23) Mexico"] <-1
#Coding for US Born
LNS$USborn <-0
LNS$USborn[LNS$BORNUS=="(1) Mainland US"] <-1 #Doesn't include Puerto Rico ((2) Puerto Rico)- 6 Mex
# Coding for parental nativity (from PARBORN), stored as a numeric code:
#   2 = both parents born in the U.S.
#   1 = one parent born in the U.S.
#   0 = neither parent born in the U.S.
#   NA = "(4) Don't Know" / "(5) Refused" / missing
# The column is built as numeric from the start; assigning numbers into a copy
# of the character PARBORN column would leave "0"/"1"/"2" strings and coerce
# the combined data set's column to character on rbind().
LNS$USborn_parents <- NA_real_
LNS$USborn_parents[LNS$PARBORN == "(1) One parent born in the U.S."] <- 1
LNS$USborn_parents[LNS$PARBORN == "(2) Both parents born in the U.S."] <- 2
# Applied last so that respondents not born on the U.S. mainland (USborn == 0)
# are always coded 0, whatever they answered on PARBORN.
LNS$USborn_parents[LNS$PARBORN == "(3) Neither parent born in the U.S." | LNS$USborn == 0] <- 0

# The LNS item (GRANBORN) asks how many grandparents were born outside the
# U.S., so the stored code runs in the opposite direction:
# "(0) None" -> 4, "(1) One" -> 3, "(2) Two" -> 2, "(3) Three" -> 1,
# "(4) All" -> 0. Any other answer (or a missing one) stays NA.
LNS$USborn_gparents <- 5 - match(
  LNS$GRANBORN,
  c("(0) None", "(1) One", "(2) Two", "(3) Three", "(4) All")
)
# Respondents coded with no U.S.-born parent are set to 0 regardless of GRANBORN.
LNS$USborn_gparents[LNS$USborn_parents == 0] <- 0

LNS$age <- LNS$AGE


diag_df <- LNS %>%
  group_by(PARTYID, INDPARTY, CLOPART) %>%
  reframe(count=n())

# Three-category party ID. Statement order matters here:
#   1. Anyone who names a party on PARTYID, INDPARTY or CLOPART is coded as
#      Democrat / Republican (so leaners are counted with the party).
#   2. A row matching both a Democratic and a Republican answer ends up
#      Republican, because that assignment runs second.
#   3. Independent / Other are only filled in for rows that are still NA.
LNS$Party <-NA
LNS$Party[LNS$PARTYID=="(1) Democrat" | LNS$INDPARTY=="(2) Democrat" | LNS$CLOPART=="(2) Closer to Democrat"] <- "Democrat"
LNS$Party[LNS$PARTYID== "(2) Republican" | LNS$INDPARTY =="(1) Republican" | LNS$CLOPART=="(1) Closer to Republican"]<- "Republican"
LNS$Party[LNS$PARTYID %in% c("(3) Independent", "(4) Don't care") & is.na(LNS$Party)]<- "Independent"
LNS$Party[LNS$PARTYID %in% c("(5) Don't know/other party") & is.na(LNS$Party)]<- "Other"

# Seven-point version: starts as a copy of Party, then strength of
# partisanship (STRDPARTN) splits partisans into Strong / Weak. The two
# "Lean" assignments run last and therefore override any earlier value for
# rows with an INDPARTY / CLOPART party answer. Rows matching none of the
# conditions keep their three-category Party value.
LNS$Party7 <- LNS$Party
LNS$Party7[LNS$STRDPARTN == "(1) Strong ANSWERFROM(QJ1)" & LNS$Party == "Democrat"] <- "Strong Democrat" 
LNS$Party7[LNS$STRDPARTN == "(1) Strong ANSWERFROM(QJ1)" & LNS$Party == "Republican"] <- "Strong Republican"
LNS$Party7[LNS$STRDPARTN == "(2) Not very strong ANSWERFROM(QJ1)" & LNS$Party == "Democrat"] <- "Weak Democrat" 
LNS$Party7[LNS$STRDPARTN == "(2) Not very strong ANSWERFROM(QJ1)" & LNS$Party == "Republican"] <- "Weak Republican"
LNS$Party7[LNS$INDPARTY == "(1) Republican" | LNS$CLOPART == "(1) Closer to Republican"] <- "Lean Republican"
LNS$Party7[LNS$INDPARTY == "(2) Democrat" | LNS$CLOPART == "(2) Closer to Democrat"] <- "Lean Democrat"


diag_df <- LNS %>%
  group_by(Party, Party7, PARTYID, INDPARTY, CLOPART) %>%
  reframe(count=n())

LNS$state <- LNS$RSTATE
LNS$region <- ""

LNS$Year <- 2006
LNS$Survey <- "LNS"
LNS$id <- LNS$RESPID
LNS$Weight <- LNS$WT_NATION_REV
LNS$Weight_AllLatinos <- NA

LNS$date <- LNS$DOI
LNS$birthyear <- LNS$BIRDATE

LNS$education <- NA
LNS$education[LNS$REDUC %in% c("(0) None", "(1) Eighth grade or below", "(2) Some high school")] <- "NoHS"
LNS$education[LNS$REDUC %in% c("(3) GED", "(4) High school graduate")] <- "HSOnly" # includes GED
LNS$education[LNS$REDUC %in% c("(5) Some college")] <- "SomeColl_Other"
LNS$education[LNS$REDUC %in% c("(6) 4 year college degree", "(7) Graduate or professional degree")] <- "CollGrad"

# National origin from ETHNIC; anything not listed below stays "Other".
LNS$origin <- "Other"
LNS$origin[LNS$ETHNIC == "(2) Mexican"] <- "Mexican"
# Was matched against "(3) Mexican", which duplicates the Mexican label above
# and left Cuban respondents in "Other".
# NOTE(review): confirm the exact "(3) Cuban" label against the LNS codebook.
LNS$origin[LNS$ETHNIC == "(3) Cuban"] <- "Cuban"
LNS$origin[LNS$ETHNIC == "(5) Puerto Rican"] <- "Puerto Rican"
LNS$origin[LNS$ETHNIC == "(4) Dominican"] <- "Dominican"
LNS$origin[LNS$ETHNIC == "(6) Salvadoran"] <- "Salvadoran"


LNS <- subset(LNS, select=c(Year, Survey, id, Weight, Weight_AllLatinos, state, region, sex, age, Mexican, USborn, USborn_parents, USborn_gparents, Party, Party7, date, birthyear, education, origin, citizen))

df_collected <- rbind(df_collected,
                      LNS)

```


## CES

```{r CES 2006-2016 and 2018}

df <- ces_full

## Checking years

diag_df <- df %>%
  group_by(year) %>%
  reframe(count=n())

## Only use data through 2018; 2017/2019-2023 are separate code

df <- df %>%
  filter(year %in% c(2006:2016, 2018))

#  [1] "Year"              "Survey"            "id"               
#  [4] "Weight"            "state"             "sex"              
#  [7] "age"               "Mexican"           "USborn"           
# [10] "USborn_parents"    "USborn_gparents"   "Party"            
# [13] "Party7"            "date"              "birthyear"
# [16] "education"         "origin"            "Weight_AllLatinos"
# [16] "region"            "generation" 

## Subset to Hispanics
df <- subset(df, race == 3 | hispanic == 1)


names(df)


## Check citizen

diag_df <- ces_full %>%
  group_by(citizen) %>%
  reframe(count=n())

df <- df %>%
  mutate(citizen = if_else(citizen == 2, 
                           0,
                           citizen))


diag_df <- df %>%
  group_by(pid3, pid3_leaner, pid7) %>%
  reframe(count=n())


df <- df %>%
  mutate(
    Party = case_when(
      pid3_leaner == 1 | pid3 == 1 | pid7 %in% c(1,2,3) ~ "Democrat",
      pid3_leaner == 2 | pid3 == 2 | pid7 %in% c(5,6,7) ~ "Republican",
      pid3_leaner == 3 | pid3 == 3 | pid7 == 4 ~ "Independent",
      TRUE ~ "Other"
    )
  )

# df$Party <- NA
# df$Party[df$pid3_leaner == 1 | df$pid3 == 1] <- "Democrat"
# df$Party[df$pid7 %in% c(1,2,3)] <- "Democrat"
# df$Party[df$pid3_leaner == 2 | df$pid3 == 2] <- "Republican"
# df$Party[df$pid7 %in% c(5,6,7)] <- "Republican"
# df$Party[df$pid3_leaner == 3 | df$pid7 == 4 | df$pid3 == 3] <- "Independent"


# Party7 is a straight copy of the three-category Party for the cumulative CES.
# NOTE(review): the pid7-based seven-point recode below is commented out, so
# these rows carry "Democrat" / "Republican" / "Independent" / "Other" in
# Party7, whereas the CES 2017, 2019 and 2020 chunks recode pid7 into
# Strong / Weak / Lean categories. Confirm this difference is intended.
df$Party7 <- df$Party
# df$Party7[df$pid7 == 1] <- "Strong Democrat"
# df$Party7[df$pid7 == 2] <- "Weak Democrat"
# df$Party7[df$pid7 == 3] <- "Lean Democrat"
# df$Party7[df$pid7 == 4] <- "Independent"
# df$Party7[df$pid7 == 5] <- "Lean Republican"
# df$Party7[df$pid7 == 6] <- "Weak Republican"
# df$Party7[df$pid7 %in% c(8,9) | is.na(df$pid7) == TRUE] <- "Other"


diag2_df <- df %>%
  group_by(Party,
    pid3_leaner,
           pid3,
           pid7) %>%
  reframe(count=n())

df$sex_old <- df$sex

df <- df %>%
  mutate(sex = case_when(
    sex_old == 1 ~ "M",
    sex_old == 2 ~ "F",
    TRUE ~ NA
  ))

diag_df <- df %>%
  group_by(sex,
           sex_old) %>%
  reframe(count=n())

df$state <- df$st

diag_df <- df %>%
  group_by(state) %>%
  reframe(count=n())

df$region <- NA

df$Year <- df$year
df$Survey <- "CES"
df$id <- df$case_id
df$Weight <- df$weight
df$Weight_AllLatinos <- NA

df$birthyear <- df$birthyr
df$date <- NA

df$education <- NA
df$education[df$educ == 1] <- "NoHS"
df$education[df$educ == 2] <- "HSOnly"
df$education[df$educ %in% c(3,4)] <- "SomeColl_Other" # 2-year college grad is in here
df$education[df$educ %in% c(5,6)] <- "CollGrad"

## USBorn?

diag_df <- df %>%
  group_by(Year, USborn) %>%
  reframe(count=n())

## Need to add manually

## Origin?

diag_df <- df %>%
  group_by(Year, hisp_origin) %>%
  reframe(count=n())

## Not in 2006-2014
## Appears in 2015, 2016

## Assigned in descending order, from highest proportion
## Each respondent takes on only one national origin ID
## Higher-proportion identities supersede lower-proportion ones
## 1. Mexican
## 2. Puerto Rican
## 3. Cuban
## 4. Dominican
## 5. Salvadoran
## Remainder: Other

# Single national-origin label per respondent. The checks run top to bottom,
# so a respondent whose hisp_origin mentions several countries takes the first
# one that matches (Mexico, then Puerto Rico, Cuba, Dominican Republic,
# El Salvador). A missing hisp_origin stays missing; everything else is "Other".
df <- df %>%
  mutate(
    origin = case_when(
      is.na(hisp_origin) ~ NA_character_,
      grepl("Mexico", hisp_origin, fixed = TRUE) ~ "Mexican",
      grepl("Puerto Rico", hisp_origin, fixed = TRUE) ~ "Puerto Rican",
      grepl("Cuba", hisp_origin, fixed = TRUE) ~ "Cuban",
      grepl("Dominican Republic", hisp_origin, fixed = TRUE) ~ "Dominican",
      grepl("El Salvador", hisp_origin, fixed = TRUE) ~ "Salvadoran",
      TRUE ~ "Other"
    )
  )

diag_df <- df %>%
  group_by(year, hisp_origin, origin) %>%
  reframe(count=n())


df <- subset(data.frame(df), select=c(Year, Survey, id, Weight, Weight_AllLatinos, state, region, sex, age, Mexican, USborn, USborn_parents, USborn_gparents, Party, Party7, date, birthyear, education, origin, citizen))

names(df_collected) == names(df)

df_collected <- rbind(df,
                      df_collected)

```


```{r CES 2017}

## 2017

## Easier going year-by-year (cumulative misses some variables)

ces_17_merging <- ces_17

## Variables to get

names(ces_17_merging)

#  [1] "Year"              "Survey"            "id"               
#  [4] "Weight"            "state"             "sex"              
#  [7] "age"               "Mexican"           "USborn"           
# [10] "USborn_parents"    "USborn_gparents"   "Party"            
# [13] "Party7"            "date"              "birthyear"
# [16] "education"         "origin"            "Weight_AllLatinos"
# [16] "region"            "generation" 


## Restrict to only Latino/Hispanic

diag_df <- ces_17_merging %>%
  group_by(race, hispanic) %>%
  reframe(count=n())

ces_17_merging <- ces_17_merging %>%
  filter(race == 3 | hispanic == 1)

## Make citizen

ces_17_merging <- ces_17_merging %>%
  mutate(citizen = if_else(cit1 == 1,
                           1,
                           0)
  )

## 1. Year: make new

# diag_df <- ces_17_merging %>%
#   group_by(year) %>%
#   reframe(count=n())

ces_17_merging$Year <- 2017

## 2. Survey: new

ces_17_merging <- ces_17_merging %>%
  mutate(Survey = "CES")

## 3. id = V101

diag_df <- ces_17_merging %>%
  group_by(V101) %>%
  reframe(count=n())

ces_17_merging$id <- ces_17_merging$V101

## 4. Weight = weight

ces_17_merging <- ces_17_merging %>%
  dplyr::rename(Weight = weights_common)

## 5. state = inputstate (need to clean still)

diag_df <- tabyl(df1, state)
diag_df <- tabyl(ces_17_merging, inputstate)

state_list <- tidycensus::fips_codes %>%
  select(state,
         state_code) %>%
  mutate(state_code = as.numeric(state_code)) %>%
  distinct()

ces_17_merging <- left_join(ces_17_merging,
                            state_list,
                            by = c("inputstate" = "state_code"))


diag_df <- ces_17_merging %>%
  group_by(inputstate,
           state) %>%
  reframe(count = n())

## 6. sex = gender

diag_df <- tabyl(df1, sex)
diag_df <- tabyl(ces_17_merging, gender)


ces_17_merging <- ces_17_merging %>%
  mutate(sex = case_when(
    gender == 1 ~ "M",
    gender == 2 ~ "F",
    TRUE ~ NA
  ))

diag_df <- ces_17_merging %>%
  group_by(sex, gender) %>%
  reframe(count=n())

## 7. age: make with birthyear

diag_df <- tabyl(df1, age)
diag_df <- tabyl(ces_17_merging, birthyr)

ces_17_merging <- ces_17_merging %>%
  mutate(age = 2017 - birthyr)

diag_df <- ces_17_merging %>%
  group_by(age,
           birthyr) %>%
  reframe(count = n())

## 8. Mexican: make with national origin variable (?)

diag_df <- tabyl(df1, Mexican)

## Don't need, just make empty variable

ces_17_merging$Mexican <- NA

# diag_df <- ces_20 %>%
#   group_by(CC20_hisp_1,
#            CC20_hisp_2,
#            CC20_hisp_3,
#            CC20_hisp_4) %>%
#   reframe(count=n())

# diag_df <- ces_20 %>%
#   group_by(CC20_hisp_2) %>%
#   reframe(count=n())

## hisp_2 == US origin

# diag_df <- ces_20 %>%
#   group_by(CC20_hisp_3) %>%
#   reframe(count=n())

## hisp_3 == Mexican origin

# ces_20_merging <- ces_20_merging %>%
#   mutate(Mexican = case_when(
#     CC20_hisp_3 == 1 ~ 1,
#     CC20_hisp_3 == 2 ~ 0,
#     TRUE ~ NA
#   ))

# diag_df <- ces_20_merging %>%
#   group_by(Mexican) %>%
#   reframe(count=n())

## 9. USborn: make
## 10. USborn_parents: make
## 11. USborn_gparents: make

diag_df <- ces_17_merging %>%
  group_by(immstat,
           cit1) %>%
  reframe(count=n())


# USborn from immstat, using the same split as the CES 2020 chunk:
# codes 3, 4, 5 are U.S.-born (1); codes 1 and 2 are immigrants (0).
# The previous ifelse(immstat == 1, 0, 1) marked every non-1 code, including
# code 2, as U.S.-born. Missing or unexpected codes are kept as NA.
ces_17_merging <- ces_17_merging %>%
  mutate(
    USborn = case_when(
      immstat %in% c(3, 4, 5) ~ 1,
      immstat %in% c(1, 2) ~ 0,
      TRUE ~ NA_real_
    )
  )

ces_17_merging$USborn_parents <- NA
ces_17_merging$USborn_gparents <- NA

         
## Don't need anything but USborn

diag_df <- ces_17_merging %>%
  group_by(immstat,
           USborn,
           USborn_parents,
           USborn_gparents) %>%
  reframe(count=n())

## 12. Party = pid3_leaner
## 13. Party7 = pid7

diag_df <- ces_17_merging %>%
  group_by(pid3, pid7) %>%
  reframe(count=n())


ces_17_merging <- ces_17_merging %>%
  mutate(
    Party = case_when(
      pid3 == 1 | pid7 %in% c(1,2,3) ~ "Democrat",
      pid3 == 2 | pid7 %in% c(5,6,7) ~ "Republican",
      pid3 == 3 | pid7 == 4 ~ "Independent",
      TRUE ~ "Other"),
    Party7 = case_when(
      pid7 == 1 ~ "Strong Democrat",
      pid7 == 2 ~ "Weak Democrat",
      pid7 == 3 ~ "Lean Democrat",
      pid7 == 4 ~ "Independent",
      pid7 == 5 ~ "Lean Republican",
      pid7 == 6 ~ "Weak Republican",
      pid7 == 7 ~ "Strong Republican",
      TRUE ~ "Other"
    )
  )
    


diag_df <- ces_17_merging %>%
  group_by(Party,
           pid3,
           pid7) %>%
  reframe(count=n())
  

## 14. date. Leave NA for now

# diag_df <- ces_17 %>%
#   group_by(ccesmodule) %>%
#   reframe(count=n())

# A real missing value, not the string "NA": the string would not be treated
# as missing and would turn the combined date column into character on rbind().
ces_17_merging$date <- NA


## 15. birthyear = birthyr

ces_17_merging <- ces_17_merging %>%
  dplyr::rename(birthyear = birthyr)


## 16. education

diag_df <- ces_17_merging %>%
  group_by(educ) %>%
  reframe(count=n())

diag_df <- df1 %>%
  group_by(education) %>%
  reframe(count=n())

# Collapse educ into the shared education categories, matching the cumulative
# CES chunk: 1 -> "NoHS", 2 -> "HSOnly", 3/4 -> "SomeColl_Other" (the
# cumulative chunk notes that 2-year college grads belong here),
# 5/6 -> "CollGrad". The label is spelled "NoHS" (not "NoHs") so it matches
# the level used for every other survey.
ces_17_merging <- ces_17_merging %>%
  mutate(education = case_when(
    educ == 1 ~ "NoHS",
    educ == 2 ~ "HSOnly",
    educ %in% c(3, 4) ~ "SomeColl_Other",
    educ %in% c(5, 6) ~ "CollGrad",
    TRUE ~ NA_character_
  ))

diag_df <- ces_17_merging %>%
  group_by(education,
           educ) %>%
  reframe(count=n())



## 17. origin

diag_df <- df1 %>%
  group_by(origin,
           Survey,
           Year) %>%
  reframe(count=n())

diag_df <- ces_17 %>%
  group_by(CC17_353a_1,
             CC17_353a_2,
             CC17_353a_3,
             CC17_353a_4,
             CC17_353a_5,
             CC17_353a_6,
             CC17_353a_7,
             CC17_353a_8,
           CC17_353a_9,
           CC17_353a_10,
           CC17_353a_11) %>%
  reframe(count=n())

## a_3 = Mexican
## a_4 = Puerto Rican
## a_5 = Cuban
## a_6 = Dominican

ces_17_merging <- ces_17_merging %>%
  mutate(origin = case_when(
    CC17_353a_3 == 1 ~ "Mexican",
    CC17_353a_4 == 1 ~ "Puerto Rican",
    CC17_353a_5 == 1 ~ "Cuban",
    CC17_353a_6 == 1 ~ "Dominican",
    TRUE ~ "Other"
  ))

diag_df <- ces_17_merging %>%
  group_by(origin) %>%
  reframe(count=n())
  

## 18. Weight_AllLatinos = NA for now

ces_17_merging <- ces_17_merging %>%
  mutate(Weight_AllLatinos = NA)


## 19. region = NA

ces_17_merging <- ces_17_merging %>%
  mutate(region = NA)


## 20: generation

# Generation label from birth year. case_when() takes the first matching
# branch, so each upper bound implicitly starts where the previous one ended:
# <1928 Greatest, 1928-1945 Silent, 1946-1964 Boomer, 1965-1980 GenX,
# 1981-1996 Millennial, 1997-2011 GenZ. Missing or later years -> "Missing".
ces_17_merging <- ces_17_merging %>%
  mutate(
    generation = case_when(
      birthyear < 1928 ~ "Greatest",
      birthyear < 1946 ~ "Silent",
      birthyear < 1965 ~ "Boomer",
      birthyear < 1981 ~ "GenX",
      birthyear < 1997 ~ "Millennial",
      birthyear < 2012 ~ "GenZ",
      TRUE ~ "Missing"
    )
  )

diag_df <- ces_17_merging %>%
  group_by(generation,
           birthyear) %>%
  reframe(count=n())



## Final selection

## Variables to get:

names(df1)

#  [1] "Year"              "Survey"            "id"                "Weight"            "state"             "sex"              
#  [7] "age"               "Mexican"           "USborn"            "USborn_parents"    "USborn_gparents"   "Party"            
# [13] "Party7"            "date"              "birthyear"         "education"         "origin"            "Weight_AllLatinos"
# [19] "region"            "generation"

ces_17_merging <- ces_17_merging %>%
select(Year, Survey, id, Weight, Weight_AllLatinos, state, region, sex, age, Mexican, USborn, USborn_parents, USborn_gparents, Party, Party7, date, birthyear, education, origin, citizen)

names(df_collected) == names(ces_17_merging)

df_collected <- rbind(df_collected,
                      ces_17_merging)

diag_df <- df_collected %>%
  group_by(Year,
           Survey) %>%
  reframe(count=n())


```


```{r CES 2019}

## 2019

## Easier going year-by-year (cumulative misses some variables)

ces_19_merging <- ces_19

## Variables to get

names(ces_19_merging)

#  [1] "Year"              "Survey"            "id"               
#  [4] "Weight"            "state"             "sex"              
#  [7] "age"               "Mexican"           "USborn"           
# [10] "USborn_parents"    "USborn_gparents"   "Party"            
# [13] "Party7"            "date"              "birthyear"
# [16] "education"         "origin"            "Weight_AllLatinos"
# [16] "region"            "generation" 


## Restrict to only Latino/Hispanic

diag_df <- ces_19_merging %>%
  group_by(race, hispanic) %>%
  reframe(count=n())

ces_19_merging <- ces_19_merging %>%
  filter(race == 3 | hispanic == 1)

## Make citizens

ces_19_merging <- ces_19_merging %>%
  mutate(citizen = if_else(cit1 == 1,
                           1,
                           0)
  )

## 1. Year: make new

# diag_df <- ces_19_merging %>%
#   group_by(year) %>%
#   reframe(count=n())

ces_19_merging$Year <- 2019

## 2. Survey: new

ces_19_merging <- ces_19_merging %>%
  mutate(Survey = "CES")

## 3. id = caseid

diag_df <- ces_19_merging %>%
  group_by(caseid) %>%
  reframe(count=n())

ces_19_merging$id <- ces_19_merging$caseid

## 4. Weight = weight

ces_19_merging <- ces_19_merging %>%
  dplyr::rename(Weight = commonweight)

## 5. state = inputstate (need to clean still)

diag_df <- tabyl(df1, state)
diag_df <- tabyl(ces_19_merging, inputstate)

state_list <- tidycensus::fips_codes %>%
  select(state,
         state_code) %>%
  mutate(state_code = as.numeric(state_code)) %>%
  distinct()

ces_19_merging <- left_join(ces_19_merging,
                            state_list,
                            by = c("inputstate" = "state_code"))


diag_df <- ces_19_merging %>%
  group_by(inputstate,
           state) %>%
  reframe(count = n())

## 6. sex = gender

diag_df <- tabyl(df1, sex)
diag_df <- tabyl(ces_19_merging, gender)


ces_19_merging <- ces_19_merging %>%
  mutate(sex = case_when(
    gender == 1 ~ "M",
    gender == 2 ~ "F",
    TRUE ~ NA
  ))

diag_df <- ces_19_merging %>%
  group_by(sex, gender) %>%
  reframe(count=n())

## 7. age: make with birthyear

diag_df <- tabyl(df1, age)
diag_df <- tabyl(ces_19_merging, birthyr)

ces_19_merging <- ces_19_merging %>%
  mutate(age = 2019 - birthyr)

diag_df <- ces_19_merging %>%
  group_by(age,
           birthyr) %>%
  reframe(count = n())

## 8. Mexican: make with national origin variable (?)

diag_df <- tabyl(df1, Mexican)

## Don't need, just make empty variable

ces_19_merging$Mexican <- NA

# diag_df <- ces_20 %>%
#   group_by(CC20_hisp_1,
#            CC20_hisp_2,
#            CC20_hisp_3,
#            CC20_hisp_4) %>%
#   reframe(count=n())

# diag_df <- ces_20 %>%
#   group_by(CC20_hisp_2) %>%
#   reframe(count=n())

## hisp_2 == US origin

# diag_df <- ces_20 %>%
#   group_by(CC20_hisp_3) %>%
#   reframe(count=n())

## hisp_3 == Mexican origin

# ces_20_merging <- ces_20_merging %>%
#   mutate(Mexican = case_when(
#     CC20_hisp_3 == 1 ~ 1,
#     CC20_hisp_3 == 2 ~ 0,
#     TRUE ~ NA
#   ))

# diag_df <- ces_20_merging %>%
#   group_by(Mexican) %>%
#   reframe(count=n())

## 9. USborn: make
## 10. USborn_parents: make
## 11. USborn_gparents: make

diag_df <- ces_19_merging %>%
  group_by(immstat,
           cit1) %>%
  reframe(count=n())


# USborn from immstat, using the same split as the CES 2020 chunk:
# codes 3, 4, 5 are U.S.-born (1); codes 1 and 2 are immigrants (0).
# The previous ifelse(immstat == 1, 0, 1) marked every non-1 code, including
# code 2, as U.S.-born. Missing or unexpected codes are kept as NA.
ces_19_merging <- ces_19_merging %>%
  mutate(
    USborn = case_when(
      immstat %in% c(3, 4, 5) ~ 1,
      immstat %in% c(1, 2) ~ 0,
      TRUE ~ NA_real_
    )
  )

ces_19_merging$USborn_parents <- NA
ces_19_merging$USborn_gparents <- NA

         
## Don't need anything but USborn

diag_df <- ces_19_merging %>%
  group_by(immstat,
           USborn,
           USborn_parents,
           USborn_gparents) %>%
  reframe(count=n())

## 12. Party = pid3_leaner
## 13. Party7 = pid7

diag_df <- ces_19_merging %>%
  group_by(pid3) %>%
  reframe(count=n())


ces_19_merging <- ces_19_merging %>%
  mutate(
    Party = case_when(
      pid3 == 1 | pid7 %in% c(1,2,3) ~ "Democrat",
      pid3 == 2 | pid7 %in% c(5,6,7) ~ "Republican",
      pid3 == 3 | pid7 == 4 ~ "Independent",
      TRUE ~ "Other"),
    Party7 = case_when(
      pid7 == 1 ~ "Strong Democrat",
      pid7 == 2 ~ "Weak Democrat",
      pid7 == 3 ~ "Lean Democrat",
      pid7 == 4 ~ "Independent",
      pid7 == 5 ~ "Lean Republican",
      pid7 == 6 ~ "Weak Republican",
      pid7 == 7 ~ "Strong Republican",
      TRUE ~ "Other"
    )
  )
    


diag_df <- ces_19_merging %>%
  group_by(Party,
           pid3,
           pid7) %>%
  reframe(count=n())
  
  

## 14. date. Leave NA for now

# diag_df <- ces_19 %>%
#   group_by(ccesmodule) %>%
#   reframe(count=n())

# A real missing value, not the string "NA": the string would not be treated
# as missing and would turn the combined date column into character on rbind().
ces_19_merging$date <- NA


## 15. birthyear = birthyr

ces_19_merging <- ces_19_merging %>%
  dplyr::rename(birthyear = birthyr)


## 16. education

diag_df <- ces_19_merging %>%
  group_by(educ) %>%
  reframe(count=n())

diag_df <- df1 %>%
  group_by(education) %>%
  reframe(count=n())

# Collapse educ into the shared education categories, matching the cumulative
# CES chunk: 1 -> "NoHS", 2 -> "HSOnly", 3/4 -> "SomeColl_Other" (the
# cumulative chunk notes that 2-year college grads belong here),
# 5/6 -> "CollGrad". The label is spelled "NoHS" (not "NoHs") so it matches
# the level used for every other survey.
ces_19_merging <- ces_19_merging %>%
  mutate(education = case_when(
    educ == 1 ~ "NoHS",
    educ == 2 ~ "HSOnly",
    educ %in% c(3, 4) ~ "SomeColl_Other",
    educ %in% c(5, 6) ~ "CollGrad",
    TRUE ~ NA_character_
  ))

diag_df <- ces_19_merging %>%
  group_by(education,
           educ) %>%
  reframe(count=n())



## 17. origin

diag_df <- df1 %>%
  group_by(origin,
           Survey,
           Year) %>%
  reframe(count=n())

diag_df <- ces_19 %>%
  group_by(CC19_353a_1,
             CC19_353a_2,
             CC19_353a_3,
             CC19_353a_4,
             CC19_353a_5,
             CC19_353a_6,
             CC19_353a_7,
             CC19_353a_8) %>%
  reframe(count=n())

## a_3 = Mexican
## a_4 = Puerto Rican
## a_5 = Cuban

ces_19_merging <- ces_19_merging %>%
  mutate(origin = case_when(
    CC19_353a_3 == 1 ~ "Mexican",
    CC19_353a_4 == 1 ~ "Puerto Rican",
    CC19_353a_5 == 1 ~ "Cuban",
    CC19_353a_6 == 1 ~ "Dominican",
    TRUE ~ "Other"
  ))

diag_df <- ces_19_merging %>%
  group_by(origin) %>%
  reframe(count=n())
  

## 18. Weight_AllLatinos = NA for now

ces_19_merging <- ces_19_merging %>%
  mutate(Weight_AllLatinos = NA)


## 19. region = NA

ces_19_merging <- ces_19_merging %>%
  mutate(region = NA)


## 20: generation

# Generation label from birth year. case_when() takes the first matching
# branch, so each upper bound implicitly starts where the previous one ended:
# <1928 Greatest, 1928-1945 Silent, 1946-1964 Boomer, 1965-1980 GenX,
# 1981-1996 Millennial, 1997-2011 GenZ. Missing or later years -> "Missing".
ces_19_merging <- ces_19_merging %>%
  mutate(
    generation = case_when(
      birthyear < 1928 ~ "Greatest",
      birthyear < 1946 ~ "Silent",
      birthyear < 1965 ~ "Boomer",
      birthyear < 1981 ~ "GenX",
      birthyear < 1997 ~ "Millennial",
      birthyear < 2012 ~ "GenZ",
      TRUE ~ "Missing"
    )
  )

diag_df <- ces_19_merging %>%
  group_by(generation,
           birthyear) %>%
  reframe(count=n())



## Final selection

## Variables to get:

names(df1)

#  [1] "Year"              "Survey"            "id"                "Weight"            "state"             "sex"              
#  [7] "age"               "Mexican"           "USborn"            "USborn_parents"    "USborn_gparents"   "Party"            
# [13] "Party7"            "date"              "birthyear"         "education"         "origin"            "Weight_AllLatinos"
# [19] "region"            "generation"

ces_19_merging <- ces_19_merging %>%
select(Year, Survey, id, Weight, Weight_AllLatinos, state, region, sex, age, Mexican, USborn, USborn_parents, USborn_gparents, Party, Party7, date, birthyear, education, origin, citizen)

names(df_collected) == names(ces_19_merging)

df_collected <- rbind(df_collected,
                      ces_19_merging)

diag_df <- df_collected %>%
  group_by(Year,
           Survey) %>%
  reframe(count=n())


```


```{r CES 2020}

## 2020 CES data

ces_20_merging <- ces_20

## Variables to get:
names(df1)

#  [1] "Year"              "Survey"            "id"               
#  [4] "Weight"            "state"             "sex"              
#  [7] "age"               "Mexican"           "USborn"           
# [10] "USborn_parents"    "USborn_gparents"   "Party"            
# [13] "Party7"            "date"              "birthyear"
# [16] "education"         "origin"            "Weight_AllLatinos"
# [16] "region"            "generation" 

names(ces_20)

## Restrict to only Latino/Hispanic

ces_20_merging <- ces_20_merging %>%
  filter(race == 3 | hispanic == 1)

## Make new citizen

diag_df <- ces_20_merging %>%
  group_by(cit1) %>%
  reframe(count=n())

ces_20_merging <- ces_20_merging %>%
  mutate(citizen = if_else(cit1 == 1,
                           1,
                           0)
  )

## 1. Year: make new

diag_df <- tabyl(ces_20, birthyr)

ces_20_merging <- ces_20_merging %>%
  mutate(Year = 2020)

## 2. Survey: new

ces_20_merging <- ces_20_merging %>%
  mutate(Survey = "CES")

## 3. id = caseid

diag_df <- ces_20_merging %>%
  group_by(`...1`) %>%
  reframe(count=n())

ces_20_merging$id <- ces_20_merging$`...1`

## 4. Weight = weight

ces_20_merging <- ces_20_merging %>%
  dplyr::rename(Weight = commonweight)

## 5. state = inputstate (need to clean still)

diag_df <- tabyl(df1, state)
diag_df <- tabyl(ces_20, inputstate)

state_list <- tidycensus::fips_codes %>%
  select(state,
         state_code) %>%
  mutate(state_code = as.numeric(state_code)) %>%
  distinct()

ces_20_merging <- left_join(ces_20_merging,
                            state_list,
                            by = c("inputstate" = "state_code"))

diag_df <- ces_20_merging %>%
  group_by(state,
           inputstate) %>%
  reframe(count = n())

## 6. sex = gender

diag_df <- tabyl(df1, sex)
diag_df <- tabyl(ces_20, gender)

ces_20_merging <- ces_20_merging %>%
  mutate(sex = case_when(
    gender == 1 ~ "M",
    gender == 2 ~ "F",
    TRUE ~ NA
  ))

## 7. age: make with birthyear

diag_df <- tabyl(df1, age)
diag_df <- tabyl(ces_20, birthyr)

ces_20_merging <- ces_20_merging %>%
  mutate(age = 2020 - birthyr)

diag_df <- ces_20_merging %>%
  group_by(age,
           birthyr) %>%
  reframe(count = n())

## 8. Mexican: make with national origin variable (?)

diag_df <- tabyl(df1, Mexican)

diag_df <- ces_20 %>%
  group_by(CC20_hisp_1,
           CC20_hisp_2,
           CC20_hisp_3,
           CC20_hisp_4) %>%
  reframe(count=n())

diag_df <- ces_20 %>%
  group_by(CC20_hisp_2) %>%
  reframe(count=n())

## hisp_2 == US origin

diag_df <- ces_20 %>%
  group_by(CC20_hisp_3) %>%
  reframe(count=n())

## hisp_3 == Mexican origin

ces_20_merging <- ces_20_merging %>%
  mutate(Mexican = case_when(
    CC20_hisp_3 == 1 ~ 1,
    CC20_hisp_3 == 2 ~ 0,
    TRUE ~ NA
  ))

diag_df <- ces_20_merging %>%
  group_by(Mexican) %>%
  reframe(count=n())

## 9. USborn: make
## 10. USborn_parents: make
## 11. USborn_gparents: make

diag_df <- ces_20_merging %>%
  group_by(immstat,
           cit1) %>%
  reframe(count=n())


ces_20_merging <- ces_20_merging %>%
  mutate(USborn = ifelse(immstat %in% c(3,4,5),
                         1,
                         0),
         USborn_parents = ifelse(immstat %in% c(3),
                                 1,
                                 0),
         USborn_gparents= ifelse(immstat %in% c(5),
                                 1,
                                 0)
  )

diag_df <- ces_20_merging %>%
  group_by(immstat,
           USborn,
           USborn_parents,
           USborn_gparents) %>%
  reframe(count=n())


## 12. Party = pid3_leaner
## 13. Party7 = pid7

diag2_df <- ces_20 %>%
  group_by(pid7) %>%
  reframe(count=n())

ces_20_merging <- ces_20_merging %>%
  mutate(
    Party = case_when(
      pid3 == 1 | pid7 %in% c(1,2,3) ~ "Democrat",
      pid3 == 2 | pid7 %in% c(5,6,7) ~ "Republican",
      pid3 == 3 | pid7 == 4 ~ "Independent",
      TRUE ~ "Other"),
    Party7 = case_when(
      pid7 == 1 ~ "Strong Democrat",
      pid7 == 2 ~ "Weak Democrat",
      pid7 == 3 ~ "Lean Democrat",
      pid7 == 4 ~ "Independent",
      pid7 == 5 ~ "Lean Republican",
      pid7 == 6 ~ "Weak Republican",
      pid7 == 7 ~ "Strong Republican",
      TRUE ~ "Other"
    )
  )


diag_df <- ces_20_merging %>%
  group_by(Party,
           pid3,
           pid7) %>%
  reframe(count=n())
  
  

## 14. date. Leave NA for now

diag_df <- ces_20 %>%
  group_by(ccesmodule) %>%
  reframe(count=n())

# A real missing value, not the string "NA": the string would not be treated
# as missing and would turn the combined date column into character on rbind().
ces_20_merging$date <- NA


## 15. birthyear = birthyr

ces_20_merging <- ces_20_merging %>%
  dplyr::rename(birthyear = birthyr)


## 16. education

diag_df <- ces_20 %>%
  group_by(educ) %>%
  reframe(count=n())

diag_df <- df1 %>%
  group_by(education) %>%
  reframe(count=n())

# Collapse educ into the shared education categories, matching the cumulative
# CES chunk: 1 -> "NoHS", 2 -> "HSOnly", 3/4 -> "SomeColl_Other" (the
# cumulative chunk notes that 2-year college grads belong here),
# 5/6 -> "CollGrad". The label is spelled "NoHS" (not "NoHs") so it matches
# the level used for every other survey.
ces_20_merging <- ces_20_merging %>%
  mutate(education = case_when(
    educ == 1 ~ "NoHS",
    educ == 2 ~ "HSOnly",
    educ %in% c(3, 4) ~ "SomeColl_Other",
    educ %in% c(5, 6) ~ "CollGrad",
    TRUE ~ NA_character_
  ))

diag_df <- ces_20_merging %>%
  group_by(education,
           educ) %>%
  reframe(count=n())



## 17. origin

diag_df <- df1 %>%
  group_by(origin,
           Survey,
           Year) %>%
  reframe(count=n())

diag_df <- ces_20_merging %>%
  group_by(CC20_hisp_3,
           CC20_hisp_4,
           CC20_hisp_5,
           CC20_hisp_6,
           CC20_hisp_7) %>%
  reframe(count=n())
 
## hisp_3 = Mexican
## hisp_4 = Puerto Rican
## hisp_5 = Cuban

ces_20_merging <- ces_20_merging %>%
  mutate(origin = case_when(
    CC20_hisp_3 == 1 ~ "Mexican",
    CC20_hisp_4 == 1 ~ "Puerto Rican",
    CC20_hisp_5 == 1 ~ "Cuban",
    CC20_hisp_6 == 1 ~ "Dominican",
    TRUE ~ "Other"
  ))

diag_df <- ces_20_merging %>%
  group_by(origin, Mexican) %>%
  reframe(count=n())
  

## 18. Weight_AllLatinos = NA for now

ces_20_merging <- ces_20_merging %>%
  mutate(Weight_AllLatinos = NA)


## 19. region = NA

ces_20_merging <- ces_20_merging %>%
  mutate(region = NA)


## 20: generation

# Generation label from birth year. case_when() takes the first matching
# branch, so each upper bound implicitly starts where the previous one ended:
# <1928 Greatest, 1928-1945 Silent, 1946-1964 Boomer, 1965-1980 GenX,
# 1981-1996 Millennial, 1997-2011 GenZ. Missing or later years -> "Missing".
ces_20_merging <- ces_20_merging %>%
  mutate(
    generation = case_when(
      birthyear < 1928 ~ "Greatest",
      birthyear < 1946 ~ "Silent",
      birthyear < 1965 ~ "Boomer",
      birthyear < 1981 ~ "GenX",
      birthyear < 1997 ~ "Millennial",
      birthyear < 2012 ~ "GenZ",
      TRUE ~ "Missing"
    )
  )

diag_df <- ces_20_merging %>%
  group_by(generation,
           birthyear) %>%
  reframe(count=n())



## Final selection

## Variables to get:

names(df1)

#  [1] "Year"              "Survey"            "id"                "Weight"            "state"             "sex"              
#  [7] "age"               "Mexican"           "USborn"            "USborn_parents"    "USborn_gparents"   "Party"            
# [13] "Party7"            "date"              "birthyear"         "education"         "origin"            "Weight_AllLatinos"
# [19] "region"            "generation"

ces_20_merging <- ces_20_merging %>%
select(Year, Survey, id, Weight, Weight_AllLatinos, state, region, sex, age, Mexican, USborn, USborn_parents, USborn_gparents, Party, Party7, date, birthyear, education, origin, citizen)

names(df1) == names(ces_20_merging)

df_collected <- rbind(df_collected,
                      ces_20_merging)

diag_df <- df_collected %>%
  group_by(Year,
           Survey) %>%
  reframe(count=n())



```


```{r CES 2021}

## 2021

## Easier going year-by-year (cumulative misses some variables)

ces_21_merging <- ces_21

## Variables to get

names(ces_21_merging)

#  [1] "Year"              "Survey"            "id"               
#  [4] "Weight"            "state"             "sex"              
#  [7] "age"               "Mexican"           "USborn"           
# [10] "USborn_parents"    "USborn_gparents"   "Party"            
# [13] "Party7"            "date"              "birthyear"
# [16] "education"         "origin"            "Weight_AllLatinos"
# [19] "region"            "generation" 


## Restrict to only Latino/Hispanic (race == 3 or hispanic == 1)

diag_df <- reframe(group_by(ces_21_merging, race, hispanic), count = n())

## Keep the Latino/Hispanic rows, then flag citizens: citizen is 1 when
## cit1 == 1, 0 for any other non-missing cit1, NA when cit1 is missing.
## Non-citizens are kept, only flagged.
ces_21_merging <- ces_21_merging %>%
  filter(race == 3 | hispanic == 1) %>%
  mutate(citizen = if_else(cit1 == 1, 1, 0))

## 1-3. Year (constant 2021), Survey (constant "CES"), id (= caseid)

# diag_df <- ces_21_merging %>%
#   group_by(year) %>%
#   reframe(count=n())

## Rows per caseid (a count above 1 would indicate a duplicated id)
diag_df <- reframe(group_by(ces_21_merging, caseid), count = n())

## 4. Weight = commonweight
ces_21_merging <- ces_21_merging %>%
  mutate(Year = 2021,
         Survey = "CES",
         id = caseid) %>%
  dplyr::rename(Weight = commonweight)

## 5. state = inputstate (need to clean still)

diag_df <- tabyl(df1, state)
diag_df <- tabyl(ces_21_merging, inputstate)

## Lookup table: state abbreviation by numeric FIPS state code, one row per
## distinct (state, state_code) pair
state_list <- distinct(
  mutate(
    select(tidycensus::fips_codes, state, state_code),
    state_code = as.numeric(state_code)
  )
)

## Attach the abbreviation as `state` by matching inputstate to the FIPS code
ces_21_merging <- ces_21_merging %>%
  left_join(state_list, by = c("inputstate" = "state_code"))


## Check the inputstate -> state mapping (unmatched codes show state = NA)
diag_df <- reframe(group_by(ces_21_merging, inputstate, state), count = n())

## 6. sex = gender4

diag_df <- tabyl(df1, sex)
diag_df <- tabyl(ces_21_merging, gender4)

## gender4: 1 -> "M", 2 -> "F"; every other value (including missing) -> NA
## NOTE(review): the CES 2022 chunk maps the remaining values to "Other"
## instead of NA -- confirm which convention the combined file should use.
ces_21_merging <- mutate(
  ces_21_merging,
  sex = case_when(
    gender4 == 1 ~ "M",
    gender4 == 2 ~ "F",
    TRUE ~ NA
  )
)

diag_df <- reframe(group_by(ces_21_merging, sex, gender4), count = n())

## 7. age: survey year (2021) minus birth year

diag_df <- tabyl(df1, age)
diag_df <- tabyl(ces_21_merging, birthyr)

ces_21_merging <- mutate(ces_21_merging, age = 2021 - birthyr)

diag_df <- reframe(group_by(ces_21_merging, age, birthyr), count = n())

## 8. Mexican: not needed for this wave, created as an all-NA column

diag_df <- tabyl(df1, Mexican)

ces_21_merging$Mexican <- NA

# diag_df <- ces_20 %>%
#   group_by(CC20_hisp_1,
#            CC20_hisp_2,
#            CC20_hisp_3,
#            CC20_hisp_4) %>%
#   reframe(count=n())

# diag_df <- ces_20 %>%
#   group_by(CC20_hisp_2) %>%
#   reframe(count=n())

## hisp_2 == US origin

# diag_df <- ces_20 %>%
#   group_by(CC20_hisp_3) %>%
#   reframe(count=n())

## hisp_3 == Mexican origin

# ces_20_merging <- ces_20_merging %>%
#   mutate(Mexican = case_when(
#     CC20_hisp_3 == 1 ~ 1,
#     CC20_hisp_3 == 2 ~ 0,
#     TRUE ~ NA
#   ))

# diag_df <- ces_20_merging %>%
#   group_by(Mexican) %>%
#   reframe(count=n())

## 9. USborn: make
## 10. USborn_parents: make
## 11. USborn_gparents: make

## Inspect immigration status against citizenship before recoding
diag_df <- ces_21_merging %>%
  group_by(immstat, cit1) %>%
  reframe(count = n())


## USborn: 0 for immigrants (immstat 1 or 2), 1 for the US-born categories
## (immstat 3, 4, 5), NA when immstat is missing or outside 1-5.
## The earlier `ifelse(immstat == 1, 0, 1)` coded immstat == 2 as US-born.
## That mattered because non-citizens are kept in this data (they are only
## flagged via `citizen`), and it disagreed with the CES 2022 chunk, which
## treats only immstat 3-5 as US-born.
## NOTE(review): value meanings (1 = immigrant citizen, 2 = immigrant
## non-citizen, 3-5 = born in the US) should be confirmed against the 2021
## codebook.
## USborn_parents / USborn_gparents are not needed for this wave: all NA.
ces_21_merging <- ces_21_merging %>%
  mutate(
    USborn = case_when(
      immstat %in% c(1, 2) ~ 0,
      immstat %in% c(3, 4, 5) ~ 1,
      TRUE ~ NA
    ),
    USborn_parents = NA,
    USborn_gparents = NA
  )


## Don't need anything but USborn

diag_df <- ces_21_merging %>%
  group_by(immstat,
           USborn,
           USborn_parents,
           USborn_gparents) %>%
  reframe(count = n())

## 12. Party = pid3_leaner
## 13. Party7 = pid7

diag_df <- reframe(group_by(ces_21_merging, pid3), count = n())


## Party: three-way identification built from pid3 OR the matching pid7
## range. Conditions are tried top to bottom, so a row satisfying a
## Democratic condition is never re-labelled by a later line.
ces_21_merging <- ces_21_merging %>%
  mutate(Party = case_when(
    pid3 == 1 | pid7 %in% c(1, 2, 3) ~ "Democrat",
    pid3 == 2 | pid7 %in% c(5, 6, 7) ~ "Republican",
    pid3 == 3 | pid7 == 4 ~ "Independent",
    TRUE ~ "Other"
  ))

## Party7: text label for each pid7 value; anything outside 1-7 (or missing)
## becomes "Other"
ces_21_merging <- ces_21_merging %>%
  mutate(Party7 = case_when(
    pid7 == 1 ~ "Strong Democrat",
    pid7 == 2 ~ "Weak Democrat",
    pid7 == 3 ~ "Lean Democrat",
    pid7 == 4 ~ "Independent",
    pid7 == 5 ~ "Lean Republican",
    pid7 == 6 ~ "Weak Republican",
    pid7 == 7 ~ "Strong Republican",
    TRUE ~ "Other"
  ))

## Cross-check both recodes against pid3
diag_df <- reframe(group_by(ces_21_merging, Party, pid3, Party7), count = n())
  

## 14. date. Leave NA for now

# diag_df <- ces_21 %>%
#   group_by(ccesmodule) %>%
#   reframe(count=n())

## Stored as the literal string "NA", not as a missing value
ces_21_merging <- mutate(ces_21_merging, date = "NA")


## 15. birthyear = birthyr

ces_21_merging <- dplyr::rename(ces_21_merging, birthyear = birthyr)


## 16. education

diag_df <- reframe(group_by(ces_21_merging, educ), count = n())

diag_df <- reframe(group_by(df1, education), count = n())

## educ 1 -> "NoHs", 2 -> "HSOnly", 3 -> "SomeColl_Other", 4-6 -> "CollGrad";
## any other value (or missing) -> NA. The conditions are mutually exclusive.
## NOTE(review): the CMPS 2016 chunk spells the lowest level "NoHS"; confirm
## which spelling the combined file expects.
ces_21_merging <- ces_21_merging %>%
  mutate(education = case_when(
    educ %in% c(4, 5, 6) ~ "CollGrad",
    educ == 3 ~ "SomeColl_Other",
    educ == 2 ~ "HSOnly",
    educ == 1 ~ "NoHs",
    TRUE ~ NA
  ))

diag_df <- reframe(group_by(ces_21_merging, education, educ), count = n())



## 17. origin

diag_df <- reframe(group_by(df1, origin, Survey, Year), count = n())

diag_df <- reframe(
  group_by(ces_21,
           CC21_hisp_1, CC21_hisp_2, CC21_hisp_3, CC21_hisp_4, CC21_hisp_5),
  count = n()
)

## CC21_hisp_3 = Mexican
## CC21_hisp_4 = Puerto Rican
## CC21_hisp_5 = Cuban
## CC21_hisp_6 = Dominican
## The first flag equal to 1 (in the order below) decides the label; rows
## with none of these flags set to 1 are "Other".

ces_21_merging <- mutate(
  ces_21_merging,
  origin = case_when(
    CC21_hisp_3 == 1 ~ "Mexican",
    CC21_hisp_4 == 1 ~ "Puerto Rican",
    CC21_hisp_5 == 1 ~ "Cuban",
    CC21_hisp_6 == 1 ~ "Dominican",
    TRUE ~ "Other"
  )
)

diag_df <- reframe(group_by(ces_21_merging, origin), count = n())


## 18-19. Weight_AllLatinos and region: placeholder columns, NA for now

ces_21_merging <- mutate(ces_21_merging,
                         Weight_AllLatinos = NA,
                         region = NA)


## 20: generation

## Generation cohort from birth year.
## Fix: the GenZ row used to read `birthyear > 1996 & birthyear < 1912`,
## which can never be TRUE, so everyone born in 1997 or later was labelled
## "Missing". The upper bound is 2012, as in the CES 2020 chunk.
## case_when() uses the first condition that is TRUE, so each upper bound on
## its own delimits the cohort; a missing birthyear, or one >= 2012, falls
## through to "Missing".
ces_21_merging <- ces_21_merging %>%
  mutate(generation = case_when(
    birthyear < 1928 ~ "Greatest",
    birthyear < 1946 ~ "Silent",
    birthyear < 1965 ~ "Boomer",
    birthyear < 1981 ~ "GenX",
    birthyear < 1997 ~ "Millennial",
    birthyear < 2012 ~ "GenZ",
    TRUE ~ "Missing"
  ))

## Cohort assigned to each birth year
diag_df <- ces_21_merging %>%
  group_by(generation,
           birthyear) %>%
  reframe(count = n())



## Final selection

## Variables to get:

names(df1)

#  [1] "Year"              "Survey"            "id"                "Weight"            "state"             "sex"              
#  [7] "age"               "Mexican"           "USborn"            "USborn_parents"    "USborn_gparents"   "Party"            
# [13] "Party7"            "date"              "birthyear"         "education"         "origin"            "Weight_AllLatinos"
# [19] "region"            "generation"
# NOTE(review): the listing above shows "generation" and no "citizen", while
# the select() below keeps "citizen" and drops "generation" -- confirm the
# listing is still current.

## Keep only the harmonised columns
ces_21_merging <- select(
  ces_21_merging,
  Year, Survey, id, Weight, Weight_AllLatinos, state, region, sex, age,
  Mexican, USborn, USborn_parents, USborn_gparents, Party, Party7, date,
  birthyear, education, origin, citizen
)

## Position-by-position comparison of column names (printed for inspection)
names(df_collected) == names(ces_21_merging)

## Append this wave to the running combined data set
df_collected <- rbind(df_collected, ces_21_merging)

## Row counts per survey-year after the append
diag_df <- reframe(group_by(df_collected, Year, Survey), count = n())


```


```{r CES 2022}


### 2022 dataset

glimpse(ces_22)

ces_22_merging <- ces_22

names(ces_22)


## Restrict to only Latino/Hispanic (race == 3 or hispanic == 1), then flag
## citizens: citizen is 1 when cit1 == 1, 0 for any other non-missing cit1,
## NA when cit1 is missing. Non-citizens are kept, only flagged.

ces_22_merging <- ces_22_merging %>%
  filter(race == 3 | hispanic == 1) %>%
  mutate(citizen = if_else(cit1 == 1, 1, 0))

## 1-3. Year (constant 2022), Survey (constant "CES"), id (= caseid)

diag_df <- tabyl(ces_22, birthyr)

## Rows per caseid (a count above 1 would indicate a duplicated id)
diag_df <- reframe(group_by(ces_22_merging, caseid), count = n())

## 4. Weight = commonweight
ces_22_merging <- ces_22_merging %>%
  mutate(Year = 2022,
         Survey = "CES",
         id = caseid) %>%
  dplyr::rename(Weight = commonweight)

## 5. state = inputstate (need to clean still)

diag_df <- tabyl(df1, state)
diag_df <- tabyl(ces_22, inputstate)

## Lookup table: state abbreviation by numeric FIPS state code, one row per
## distinct (state, state_code) pair
state_list <- distinct(
  mutate(
    select(tidycensus::fips_codes, state, state_code),
    state_code = as.numeric(state_code)
  )
)

## Attach the abbreviation as `state` by matching inputstate to the FIPS code
ces_22_merging <- ces_22_merging %>%
  left_join(state_list, by = c("inputstate" = "state_code"))

## Check the inputstate -> state mapping (unmatched codes show state = NA)
diag_df <- reframe(group_by(ces_22_merging, state, inputstate), count = n())

## 6. sex = gender

diag_df <- tabyl(df1, sex)

diag_df <- reframe(group_by(ces_22_merging, gender4), count = n())

## gender4: 1 -> "M", 2 -> "F"; every other value (including missing)
## -> "Other"
## NOTE(review): the CES 2021 and 2023 chunks use NA rather than "Other" for
## the remaining values -- confirm which convention is intended.
ces_22_merging <- mutate(
  ces_22_merging,
  sex = case_when(
    gender4 == 1 ~ "M",
    gender4 == 2 ~ "F",
    TRUE ~ "Other"
  )
)

diag_df <- reframe(group_by(ces_22_merging, gender4, sex), count = n())

## 7. age: survey year (2022) minus birth year

diag_df <- tabyl(df1, age)
diag_df <- tabyl(ces_22, birthyr)

ces_22_merging <- mutate(ces_22_merging, age = 2022 - birthyr)

diag_df <- reframe(group_by(ces_22_merging, age, birthyr), count = n())

## 8. Mexican: make with national origin variable (?)

diag_df <- tabyl(df1, Mexican)

## Joint distribution of the first four origin flags in the raw file
diag_df <- reframe(
  group_by(ces_22, CC22_hisp_1, CC22_hisp_2, CC22_hisp_3, CC22_hisp_4),
  count = n()
)

diag_df <- reframe(group_by(ces_22, CC22_hisp_2), count = n())

## hisp_2 == US origin

diag_df <- reframe(group_by(ces_22, CC22_hisp_3), count = n())

## hisp_3 == Mexican origin
## Mexican: 1 when CC22_hisp_3 == 1, 0 when it == 2, NA for anything else

ces_22_merging <- mutate(
  ces_22_merging,
  Mexican = case_when(
    CC22_hisp_3 == 1 ~ 1,
    CC22_hisp_3 == 2 ~ 0,
    TRUE ~ NA
  )
)

diag_df <- reframe(group_by(ces_22_merging, Mexican), count = n())

## 9. USborn: make
## 10. USborn_parents: make
## 11. USborn_gparents: make

## Inspect immigration status against citizenship before recoding
diag_df <- reframe(group_by(ces_22_merging, immstat, cit1), count = n())

## Three 0/1 indicators derived from immstat. %in% never returns NA, so a
## missing immstat yields 0 in all three.
##   USborn          = 1 for immstat 3, 4 or 5
##   USborn_parents  = 1 for immstat 3 only
##   USborn_gparents = 1 for immstat 5 only
## NOTE(review): confirm against the codebook that immstat == 3 (rather than
## 4/5) is the category meant for USborn_parents.
ces_22_merging <- mutate(
  ces_22_merging,
  USborn = as.numeric(immstat %in% c(3, 4, 5)),
  USborn_parents = as.numeric(immstat %in% c(3)),
  USborn_gparents = as.numeric(immstat %in% c(5))
)

## Check the three indicators against immstat
diag_df <- reframe(
  group_by(ces_22_merging, immstat, USborn, USborn_parents, USborn_gparents),
  count = n()
)


## 12. Party = pid3_leaner
## 13. Party7 = pid7

diag_df <- reframe(group_by(ces_22_merging, pid3), count = n())

## Party: three-way identification built from pid3 OR the matching pid7
## range. Conditions are tried top to bottom, so a row satisfying a
## Democratic condition is never re-labelled by a later line.
ces_22_merging <- ces_22_merging %>%
  mutate(Party = case_when(
    pid3 == 1 | pid7 %in% c(1, 2, 3) ~ "Democrat",
    pid3 == 2 | pid7 %in% c(5, 6, 7) ~ "Republican",
    pid3 == 3 | pid7 == 4 ~ "Independent",
    TRUE ~ "Other"
  ))

## Party7: text label for each pid7 value; anything outside 1-7 (or missing)
## becomes "Other"
ces_22_merging <- ces_22_merging %>%
  mutate(Party7 = case_when(
    pid7 == 1 ~ "Strong Democrat",
    pid7 == 2 ~ "Weak Democrat",
    pid7 == 3 ~ "Lean Democrat",
    pid7 == 4 ~ "Independent",
    pid7 == 5 ~ "Lean Republican",
    pid7 == 6 ~ "Weak Republican",
    pid7 == 7 ~ "Strong Republican",
    TRUE ~ "Other"
  ))

## Cross-check both recodes against pid3
diag_df <- reframe(group_by(ces_22_merging, Party, pid3, Party7), count = n())
  

## 14. date. Leave NA for now

diag_df <- reframe(group_by(ces_22, ccesmodule), count = n())

## Stored as the literal string "NA", not as a missing value
ces_22_merging <- mutate(ces_22_merging, date = "NA")


## 15. birthyear = birthyr

ces_22_merging <- dplyr::rename(ces_22_merging, birthyear = birthyr)


## 16. education

diag_df <- reframe(group_by(ces_22, educ), count = n())

diag_df <- reframe(group_by(df1, education), count = n())

## educ 1 -> "NoHs", 2 -> "HSOnly", 3 -> "SomeColl_Other", 4-6 -> "CollGrad";
## any other value (or missing) -> NA. The conditions are mutually exclusive.
## NOTE(review): the CMPS 2016 chunk spells the lowest level "NoHS"; confirm
## which spelling the combined file expects.
ces_22_merging <- ces_22_merging %>%
  mutate(education = case_when(
    educ %in% c(4, 5, 6) ~ "CollGrad",
    educ == 3 ~ "SomeColl_Other",
    educ == 2 ~ "HSOnly",
    educ == 1 ~ "NoHs",
    TRUE ~ NA
  ))

diag_df <- reframe(group_by(ces_22_merging, education, educ), count = n())



## 17. origin

diag_df <- reframe(group_by(df1, origin, Survey, Year), count = n())

## No Latino origin for CCES 2006-2015

## CC22_hisp_3 = Mexican
## CC22_hisp_4 = Puerto Rican
## CC22_hisp_5 = Cuban
## CC22_hisp_6 = Dominican

diag_df <- reframe(
  group_by(ces_22_merging, CC22_hisp_3, CC22_hisp_4, CC22_hisp_5),
  count = n()
)

## The first flag equal to 1 (in the order below) decides the label; rows
## with none of these flags set to 1 are "Other".
ces_22_merging <- mutate(
  ces_22_merging,
  origin = case_when(
    CC22_hisp_3 == 1 ~ "Mexican",
    CC22_hisp_4 == 1 ~ "Puerto Rican",
    CC22_hisp_5 == 1 ~ "Cuban",
    CC22_hisp_6 == 1 ~ "Dominican",
    TRUE ~ "Other"
  )
)

## origin and Mexican both derive from CC22_hisp_3; tabulate them together
diag_df <- reframe(group_by(ces_22_merging, origin, Mexican), count = n())


## 18-19. Weight_AllLatinos and region: placeholder columns, NA for now

ces_22_merging <- mutate(ces_22_merging,
                         Weight_AllLatinos = NA,
                         region = NA)


## Final selection

## Variables to get:

names(df1)

#  [1] "Year"              "Survey"            "id"                "Weight"            "state"             "sex"              
#  [7] "age"               "Mexican"           "USborn"            "USborn_parents"    "USborn_gparents"   "Party"            
# [13] "Party7"            "date"              "birthyear"         "education"         "origin"            "Weight_AllLatinos"
# [19] "region"            "generation"
# NOTE(review): the listing above shows "generation" and no "citizen", while
# the select() below keeps "citizen" and has no "generation" -- confirm the
# listing is still current.

## Keep only the harmonised columns
ces_22_merging <- select(
  ces_22_merging,
  Year, Survey, id, Weight, Weight_AllLatinos, state, region, sex, age,
  Mexican, USborn, USborn_parents, USborn_gparents, Party, Party7, date,
  birthyear, education, origin, citizen
)

## Position-by-position comparison of column names (printed for inspection)
names(df_collected) == names(ces_22_merging)

## Append this wave to the running combined data set
df_collected <- rbind(df_collected, ces_22_merging)

```



```{r CES 2023}

## 2023

## Easier going year-by-year (cumulative misses some variables)

ces_23_merging <- ces_23

## Variables to get

names(ces_23_merging)

#  [1] "Year"              "Survey"            "id"               
#  [4] "Weight"            "state"             "sex"              
#  [7] "age"               "Mexican"           "USborn"           
# [10] "USborn_parents"    "USborn_gparents"   "Party"            
# [13] "Party7"            "date"              "birthyear"
# [16] "education"         "origin"            "Weight_AllLatinos"
# [19] "region"            "generation" 


## Restrict to only Latino/Hispanic (race == 3 or hispanic == 1)

diag_df <- reframe(group_by(ces_23_merging, race, hispanic), count = n())

## Keep the Latino/Hispanic rows, then flag citizens (rows are not dropped):
## citizen is 1 when cit1 == 1, 0 for any other non-missing cit1, NA when
## cit1 is missing.
ces_23_merging <- ces_23_merging %>%
  filter(race == 3 | hispanic == 1) %>%
  mutate(citizen = if_else(cit1 == 1, 1, 0))

## 1-3. Year (constant 2023), Survey (constant "CES"), id (= caseid)

# diag_df <- ces_21_merging %>%
#   group_by(year) %>%
#   reframe(count=n())

## Rows per caseid (a count above 1 would indicate a duplicated id)
diag_df <- reframe(group_by(ces_23_merging, caseid), count = n())

## 4. Weight = commonweight
ces_23_merging <- ces_23_merging %>%
  mutate(Year = 2023,
         Survey = "CES",
         id = caseid) %>%
  dplyr::rename(Weight = commonweight)

## 5. state = inputstate (need to clean still)

diag_df <- tabyl(df1, state)
diag_df <- tabyl(ces_23_merging, inputstate)

## Lookup table: state abbreviation by numeric FIPS state code, one row per
## distinct (state, state_code) pair
state_list <- distinct(
  mutate(
    select(tidycensus::fips_codes, state, state_code),
    state_code = as.numeric(state_code)
  )
)

## Attach the abbreviation as `state` by matching inputstate to the FIPS code
ces_23_merging <- ces_23_merging %>%
  left_join(state_list, by = c("inputstate" = "state_code"))


## Check the inputstate -> state mapping (unmatched codes show state = NA)
diag_df <- reframe(group_by(ces_23_merging, inputstate, state), count = n())

## 6. sex = gender4

diag_df <- tabyl(df1, sex)
diag_df <- tabyl(ces_23_merging, gender4)

## gender4: 1 -> "M", 2 -> "F"; every other value (including missing) -> NA
## NOTE(review): the CES 2022 chunk maps the remaining values to "Other"
## instead of NA -- confirm which convention the combined file should use.
ces_23_merging <- mutate(
  ces_23_merging,
  sex = case_when(
    gender4 == 1 ~ "M",
    gender4 == 2 ~ "F",
    TRUE ~ NA
  )
)

diag_df <- reframe(group_by(ces_23_merging, sex, gender4), count = n())

## 7. age: survey year (2023) minus birth year

diag_df <- tabyl(df1, age)
diag_df <- tabyl(ces_23_merging, birthyr)

ces_23_merging <- mutate(ces_23_merging, age = 2023 - birthyr)

diag_df <- reframe(group_by(ces_23_merging, age, birthyr), count = n())

## 8. Mexican: not needed for this wave, created as an all-NA column

diag_df <- tabyl(df1, Mexican)

ces_23_merging$Mexican <- NA

# diag_df <- ces_20 %>%
#   group_by(CC20_hisp_1,
#            CC20_hisp_2,
#            CC20_hisp_3,
#            CC20_hisp_4) %>%
#   reframe(count=n())

# diag_df <- ces_20 %>%
#   group_by(CC20_hisp_2) %>%
#   reframe(count=n())

## hisp_2 == US origin

# diag_df <- ces_20 %>%
#   group_by(CC20_hisp_3) %>%
#   reframe(count=n())

## hisp_3 == Mexican origin

# ces_20_merging <- ces_20_merging %>%
#   mutate(Mexican = case_when(
#     CC20_hisp_3 == 1 ~ 1,
#     CC20_hisp_3 == 2 ~ 0,
#     TRUE ~ NA
#   ))

# diag_df <- ces_20_merging %>%
#   group_by(Mexican) %>%
#   reframe(count=n())

## 9. USborn: make
## 10. USborn_parents: make
## 11. USborn_gparents: make

## Inspect immigration status against citizenship before recoding
diag_df <- ces_23_merging %>%
  group_by(immstat, cit1) %>%
  reframe(count = n())


## USborn: 0 for immigrants (immstat 1 or 2), 1 for the US-born categories
## (immstat 3, 4, 5), NA when immstat is missing or outside 1-5.
## The earlier `ifelse(immstat == 1, 0, 1)` coded immstat == 2 as US-born.
## That mattered because non-citizens are kept in this data (they are only
## flagged via `citizen`), and it disagreed with the CES 2022 chunk, which
## treats only immstat 3-5 as US-born.
## NOTE(review): value meanings (1 = immigrant citizen, 2 = immigrant
## non-citizen, 3-5 = born in the US) should be confirmed against the 2023
## codebook.
## USborn_parents / USborn_gparents are not needed for this wave: all NA.
ces_23_merging <- ces_23_merging %>%
  mutate(
    USborn = case_when(
      immstat %in% c(1, 2) ~ 0,
      immstat %in% c(3, 4, 5) ~ 1,
      TRUE ~ NA
    ),
    USborn_parents = NA,
    USborn_gparents = NA
  )


## Don't need anything but USborn

diag_df <- ces_23_merging %>%
  group_by(immstat,
           USborn,
           USborn_parents,
           USborn_gparents) %>%
  reframe(count = n())

## 12. Party = pid3_leaner
## 13. Party7 = pid7

diag_df <- reframe(group_by(ces_23_merging, pid3), count = n())


## Party: three-way identification built from pid3 OR the matching pid7
## range. Conditions are tried top to bottom, so a row satisfying a
## Democratic condition is never re-labelled by a later line.
ces_23_merging <- ces_23_merging %>%
  mutate(Party = case_when(
    pid3 == 1 | pid7 %in% c(1, 2, 3) ~ "Democrat",
    pid3 == 2 | pid7 %in% c(5, 6, 7) ~ "Republican",
    pid3 == 3 | pid7 == 4 ~ "Independent",
    TRUE ~ "Other"
  ))

## Party7: text label for each pid7 value; anything outside 1-7 (or missing)
## becomes "Other"
ces_23_merging <- ces_23_merging %>%
  mutate(Party7 = case_when(
    pid7 == 1 ~ "Strong Democrat",
    pid7 == 2 ~ "Weak Democrat",
    pid7 == 3 ~ "Lean Democrat",
    pid7 == 4 ~ "Independent",
    pid7 == 5 ~ "Lean Republican",
    pid7 == 6 ~ "Weak Republican",
    pid7 == 7 ~ "Strong Republican",
    TRUE ~ "Other"
  ))

## Cross-check both recodes against pid3
diag_df <- reframe(group_by(ces_23_merging, Party, pid3, Party7), count = n())
  

## 14. date. Leave NA for now

# diag_df <- ces_21 %>%
#   group_by(ccesmodule) %>%
#   reframe(count=n())

## Stored as the literal string "NA", not as a missing value
ces_23_merging <- mutate(ces_23_merging, date = "NA")


## 15. birthyear = birthyr

ces_23_merging <- dplyr::rename(ces_23_merging, birthyear = birthyr)


## 16. education

diag_df <- reframe(group_by(ces_23_merging, educ), count = n())

diag_df <- reframe(group_by(df1, education), count = n())

## educ 1 -> "NoHs", 2 -> "HSOnly", 3 -> "SomeColl_Other", 4-6 -> "CollGrad";
## any other value (or missing) -> NA. The conditions are mutually exclusive.
## NOTE(review): the CMPS 2016 chunk spells the lowest level "NoHS"; confirm
## which spelling the combined file expects.
ces_23_merging <- ces_23_merging %>%
  mutate(education = case_when(
    educ %in% c(4, 5, 6) ~ "CollGrad",
    educ == 3 ~ "SomeColl_Other",
    educ == 2 ~ "HSOnly",
    educ == 1 ~ "NoHs",
    TRUE ~ NA
  ))

diag_df <- reframe(group_by(ces_23_merging, education, educ), count = n())



## 17. origin

diag_df <- reframe(group_by(df1, origin, Survey, Year), count = n())

diag_df <- reframe(
  group_by(ces_23,
           CC23_hisp_1, CC23_hisp_2, CC23_hisp_3, CC23_hisp_4, CC23_hisp_5),
  count = n()
)

## CC23_hisp_3 = Mexican
## CC23_hisp_4 = Puerto Rican
## CC23_hisp_5 = Cuban
## CC23_hisp_6 = Dominican
## The first flag equal to 1 (in the order below) decides the label; rows
## with none of these flags set to 1 are "Other".

ces_23_merging <- mutate(
  ces_23_merging,
  origin = case_when(
    CC23_hisp_3 == 1 ~ "Mexican",
    CC23_hisp_4 == 1 ~ "Puerto Rican",
    CC23_hisp_5 == 1 ~ "Cuban",
    CC23_hisp_6 == 1 ~ "Dominican",
    TRUE ~ "Other"
  )
)

diag_df <- reframe(group_by(ces_23_merging, origin), count = n())


## 18-19. Weight_AllLatinos and region: placeholder columns, NA for now

ces_23_merging <- mutate(ces_23_merging,
                         Weight_AllLatinos = NA,
                         region = NA)


## 20: generation

## Generation cohort from birth year.
## Fix: the GenZ row used to read `birthyear > 1996 & birthyear < 1912`,
## which can never be TRUE, so everyone born in 1997 or later was labelled
## "Missing". The upper bound is 2012, as in the CES 2020 chunk.
## case_when() uses the first condition that is TRUE, so each upper bound on
## its own delimits the cohort; a missing birthyear, or one >= 2012, falls
## through to "Missing".
ces_23_merging <- ces_23_merging %>%
  mutate(generation = case_when(
    birthyear < 1928 ~ "Greatest",
    birthyear < 1946 ~ "Silent",
    birthyear < 1965 ~ "Boomer",
    birthyear < 1981 ~ "GenX",
    birthyear < 1997 ~ "Millennial",
    birthyear < 2012 ~ "GenZ",
    TRUE ~ "Missing"
  ))

## Cohort assigned to each birth year
diag_df <- ces_23_merging %>%
  group_by(generation,
           birthyear) %>%
  reframe(count = n())



## Final selection

## Variables to get:

names(df1)

#  [1] "Year"              "Survey"            "id"                "Weight"            "state"             "sex"              
#  [7] "age"               "Mexican"           "USborn"            "USborn_parents"    "USborn_gparents"   "Party"            
# [13] "Party7"            "date"              "birthyear"         "education"         "origin"            "Weight_AllLatinos"
# [19] "region"            "generation"
# NOTE(review): the listing above shows "generation" and no "citizen", while
# the select() below keeps "citizen" and drops "generation" -- confirm the
# listing is still current.

## Keep only the harmonised columns
ces_23_merging <- select(
  ces_23_merging,
  Year, Survey, id, Weight, Weight_AllLatinos, state, region, sex, age,
  Mexican, USborn, USborn_parents, USborn_gparents, Party, Party7, date,
  birthyear, education, origin, citizen
)

## Position-by-position comparison of column names (printed for inspection)
names(df_collected) == names(ces_23_merging)

## Append this wave to the running combined data set
df_collected <- rbind(df_collected, ces_23_merging)

## Row counts per survey-year after the append
diag_df <- reframe(group_by(df_collected, Year, Survey), count = n())


```





```{r CMPS 2016}

## Harmonise the CMPS 2016 file (cmps_16) to the common column set and append
## it to df_collected. Most variables are built by successive indexed
## assignments, so the ORDER of the statements matters: a later assignment
## overwrites an earlier one wherever both conditions hold.
CMPS <- cmps_16

## Restrict to Latino

diag_df <- CMPS %>%
  group_by(s2_2) %>% 
  reframe(count=n())

CMPS <- subset(CMPS, s2_2 == 1)

## Make citizen
## citizen = 1 when c375 is missing or equals 1, otherwise 0

diag_df <- CMPS %>%
  group_by(c375) %>%
  reframe(count=n())

CMPS <- CMPS %>%
  mutate(citizen = if_else(is.na(c375) == TRUE | c375 == 1,
                           1,
                           0)
  )

# sex: s3 == 1 -> "M", s3 == 2 -> "F"; any other value keeps the empty string
CMPS$sex <- ""
CMPS$sex[CMPS$s3 == 1] <- "M"
CMPS$sex[CMPS$s3 == 2] <- "F"

#Coding for Mexican yes=1 No=0
# Default 0; 1 when s10 == 12 or when the free-text answer (s10_21_other)
# contains "xico". Two free-text answers are normalised first so the pattern
# matches them; the exact answer "new mexico" is then reset to 0.
# NOTE(review): only the lowercase spelling "new mexico" is reset; other
# capitalisations containing "xico" would stay coded 1 -- check the raw values.
CMPS$Mexican <- 0
CMPS$Mexican[CMPS$s10 == 12] <- 1
CMPS$s10_21_other[CMPS$s10_21_other == "M\xe9xico and El Salvador"] <- "Mexico and El Salvador" # might be mac-only
CMPS$s10_21_other[CMPS$s10_21_other == "guardalajara"] <- "Mexico"
CMPS$Mexican[CMPS$s10_21_other %in% grep("xico",CMPS$s10_21_other, value=TRUE)] <- 1
CMPS$Mexican[CMPS$s10_21_other=="new mexico"]<-0

#Coding for US Born
# Default 0; 1 only when s7 == 1
CMPS$USborn <-0
CMPS$USborn[CMPS$s7==1] <-1 #Doesn't include Puerto Rico (s7=3) - 1 Mexican was born in PR

# USborn_parents: NA by default; 0 when the respondent is not US-born or
# c377 is 2 or 3; then 1 when c377 == 4 and 2 when c377 == 1 (these later
# lines win over the 0 assigned above).
# NOTE(review): values look like a count of US-born parents (0/1/2) --
# confirm against the c377 codebook entry.
CMPS$USborn_parents <- NA
CMPS$USborn_parents[CMPS$USborn == 0 | CMPS$c377 %in% c(2,3)] <- 0
CMPS$USborn_parents[CMPS$c377 %in% c(4)] <- 1
CMPS$USborn_parents[CMPS$c377 %in% c(1)] <- 2

#Coding for Generation (at least 1 Grandparent born outside U.S.)
# c379 values 1..4 are reversed onto 4..1; the last line sets 0 when
# c379 == 5 or USborn_parents == 0, overriding any earlier value.
CMPS$USborn_gparents <- NA
CMPS$USborn_gparents[CMPS$c379 == 1] <- 4
CMPS$USborn_gparents[CMPS$c379 == 2] <- 3
CMPS$USborn_gparents[CMPS$c379 == 3] <- 2
CMPS$USborn_gparents[CMPS$c379 == 4] <- 1
CMPS$USborn_gparents[CMPS$c379 == 5 | CMPS$USborn_parents == 0] <- 0

diag_df <- CMPS %>%
  group_by(c25, c26, c27) %>%
  reframe(count=n())

# Party from c25 / c27. "Republican" is assigned after "Democrat", so it
# wins where both conditions hold. "Independent" and "Other" only fill rows
# that are still NA.
CMPS$Party <-NA
CMPS$Party[CMPS$c25==2 | CMPS$c27==2] <- "Democrat"
CMPS$Party[CMPS$c25==1 | CMPS$c27==1] <- "Republican"
CMPS$Party[CMPS$c25==3 & is.na(CMPS$Party)] <- "Independent"
CMPS$Party[CMPS$c25 %in% c(4, 88) & is.na(CMPS$Party)] <- "Other"
# Party7 starts as a copy of Party and is then refined: c26 (1/2) gives
# Strong/Weak within a party, and any c27 value (1, 2, 3, 4/88) overrides
# with Lean Republican / Lean Democrat / Independent / Other.
CMPS$Party7 <- CMPS$Party

CMPS$Party7[CMPS$c26 == 1 & CMPS$Party == "Democrat"] <- "Strong Democrat"
CMPS$Party7[CMPS$c26 == 1 & CMPS$Party == "Republican"] <- "Strong Republican"
CMPS$Party7[CMPS$c26 == 2 & CMPS$Party == "Democrat"] <- "Weak Democrat"
CMPS$Party7[CMPS$c26 == 2 & CMPS$Party == "Republican"] <- "Weak Republican"
CMPS$Party7[CMPS$c27 == 1] <- "Lean Republican"
CMPS$Party7[CMPS$c27 == 2] <- "Lean Democrat"
CMPS$Party7[CMPS$c27 == 3] <- "Independent"
CMPS$Party7[CMPS$c27 %in% c(4, 88)] <- "Other"

# Cross-check both party recodes against the raw items
diag2_df <- CMPS %>%
  group_by(Party, Party7, c25, c26, c27) %>%
  reframe(count=n())


# state is copied from s4 as-is; region is an empty string, not NA
CMPS$state <- CMPS$s4
CMPS$region <- ""

CMPS$Year <- 2016
CMPS$Survey <- "CMPS"
CMPS$id <- CMPS$respid
CMPS$Weight <- CMPS$weight  #subgroup (Mexican) specific
CMPS$Weight_AllLatinos <- NA

CMPS$date <- CMPS$interview_end
# presumably s6 holds the birth year as an offset from 1900 -- TODO confirm
CMPS$birthyear <- CMPS$s6 + 1900
CMPS$birthyear[CMPS$birthyear == 1900] <- NA # left birth year blank

# education from c381: 1-2 -> "NoHS", 3 -> "HSOnly", 4 -> "SomeColl_Other",
# 5-6 -> "CollGrad"; anything else stays NA.
# NOTE(review): the CES chunks spell the lowest level "NoHs" (lowercase s);
# confirm which spelling the combined file expects.
CMPS$education <- NA
CMPS$education[CMPS$c381 %in% c(1,2)] <- "NoHS"
CMPS$education[CMPS$c381 %in% c(3)] <- "HSOnly"
CMPS$education[CMPS$c381 %in% c(4)] <- "SomeColl_Other"
CMPS$education[CMPS$c381 %in% c(5,6)] <- "CollGrad"

# origin from s10; every code not listed below stays "Other"
CMPS$origin <- "Other"
CMPS$origin[CMPS$s10 == 12] <- "Mexican"
CMPS$origin[CMPS$s10 == 17] <- "Puerto Rican"
CMPS$origin[CMPS$s10 == 6] <- "Cuban"
CMPS$origin[CMPS$s10 == 9] <- "Salvadoran"
CMPS$origin[CMPS$s10 == 7] <- "Dominican"


# Keep only the harmonised columns.
# NOTE(review): `age` is selected here but never created in this chunk, so
# it must already exist in cmps_16 -- confirm.
CMPS <- subset(CMPS, select=c(Year, Survey, id, Weight, Weight_AllLatinos, state, region, sex, age, Mexican, USborn, USborn_parents, USborn_gparents, Party, Party7, date, birthyear, education, origin, citizen))

# Append this wave to the running combined data set
df_collected <- rbind(df_collected,
                      CMPS)

```



```{r CMPS 2020}

## CMPS 2020

# Variables to get:

#  [1] "Year"              "Survey"            "id"                "Weight"            "state"             "sex"              
#  [7] "age"               "Mexican"           "USborn"            "USborn_parents"    "USborn_gparents"   "Party"            
# [13] "Party7"            "date"              "birthyear"         "education"         "origin"            "Weight_AllLatinos"
# [19] "region"            "generation"


## CMPS 2020

cmps_2020 <- CMPS_2020_primary_sample_weighted

names(cmps_2020)

## Restrict to only Latino/Hispanic

diag_df <- reframe(
  group_by(cmps_2020, S2_Race_Prime, S2_Hispanicr1, S2_Hispanicr2),
  count = n()
)

## S2_Race_Prime == 2 ~ Latino
## S2_Hispanicr2 == 2 ~ Latino

cmps_2020 <- filter(cmps_2020, S2_Race_Prime == 2 | S2_Hispanicr2 == 2)


## Make citizen: 1 when Q807 is missing or equals 1, otherwise 0

diag_df <- reframe(group_by(cmps_2020, Q807), count = n())

cmps_2020 <- mutate(
  cmps_2020,
  citizen = if_else(is.na(Q807) | Q807 == 1, 1, 0)
)

## 1-3. Year (constant 2020), Survey (constant "CMPS"), id (= uuid)

## Rows per uuid (a count above 1 would indicate a duplicated id)
diag_df <- reframe(group_by(cmps_2020, uuid), count = n())

## 4. Weight = weight
cmps_2020 <- cmps_2020 %>%
  mutate(Year = 2020,
         Survey = "CMPS",
         id = uuid) %>%
  dplyr::rename(Weight = weight)

## 5. state = S4 (need to clean still)

diag_df <- reframe(group_by(cmps_2020, S4), count = n())

## Doesn't use FIPS, have to manually assign:
## S4 runs 1..51; position k of the vector below is the abbreviation for
## S4 == k. match() returns NA for a missing or out-of-range S4, so those
## rows get state = NA.
## NOTE(review): code 8 is "DC" and code 9 is "DE" here -- confirm this order
## against the S4 value labels.

cmps_2020 <- cmps_2020 %>%
  mutate(state = c(
    "AL", "AK", "AZ", "AR", "CA", "CO", "CT", "DC", "DE", "FL",  #  1-10
    "GA", "HI", "ID", "IL", "IN", "IA", "KS", "KY", "LA", "ME",  # 11-20
    "MD", "MA", "MI", "MN", "MS", "MO", "MT", "NE", "NV", "NH",  # 21-30
    "NJ", "NM", "NY", "NC", "ND", "OH", "OK", "OR", "PA", "RI",  # 31-40
    "SC", "SD", "TN", "TX", "UT", "VT", "VA", "WA", "WV", "WI",  # 41-50
    "WY"                                                         # 51
  )[match(S4, 1:51)])

## Check the S4 -> state mapping
diag_df <- reframe(group_by(cmps_2020, S4, state), count = n())

## 6. sex = S3b

diag_df <- reframe(group_by(cmps_2020, S3b), count = n())

## S3b: 1 -> "M", 2 -> "F"; every other value (including missing) -> "Other"
cmps_2020 <- mutate(
  cmps_2020,
  sex = case_when(
    S3b == 1 ~ "M",
    S3b == 2 ~ "F",
    TRUE ~ "Other"
  )
)

diag_df <- reframe(group_by(cmps_2020, S3b, sex), count = n())

## 7. age: S5


diag_df <- tabyl(cmps_2020, S5)

## S5 is used as the birth year; age is the survey year (2020) minus S5

cmps_2020 <- mutate(cmps_2020,
                    birthyear = S5,
                    age = 2020 - S5)

diag_df <- reframe(group_by(cmps_2020, birthyear, age), count = n())



## 8. Mexican: S10

diag_df <- tabyl(df1, Mexican)

diag_df <- cmps_2020 %>%
  group_by(S10, S10_Mex) %>%
  reframe(count = n())

## S10: Mexican = 12
## S10_Mex: 1 = Mexican ancestry

## Mexican = 1 when either indicator says so, NA only when BOTH indicators
## are missing, and 0 otherwise.
## The earlier `ifelse(S10 == 12 | S10_Mex == 1, 1, 0)` returned NA whenever
## one indicator was missing and the other was not a match (FALSE | NA is
## NA), so respondents with a clear non-Mexican S10 but no S10_Mex answer
## lost their 0. %in% never yields NA, which avoids that.
cmps_2020 <- cmps_2020 %>%
  mutate(Mexican = case_when(
    S10 %in% 12 | S10_Mex %in% 1 ~ 1,
    is.na(S10) & is.na(S10_Mex) ~ NA,
    TRUE ~ 0
  ))

## 9. USborn: S7 == 1 (Only one that matters)
## 10. USborn_parents: Q809 == 1
## 11. USborn_gparents: Q809 == 2



## USborn: 1 when S7 == 1, 0 for any other non-missing S7, NA when S7 is
## missing. The parent / grandparent indicators are not built: all NA.
cmps_2020 <- mutate(
  cmps_2020,
  USborn = ifelse(S7 == 1, 1, 0),
  USborn_parents = NA,
  USborn_gparents = NA
)


## 12. Party = Q21, plus Q23 for leaners
## 13. Party7 = Q21 combined with Q22 and Q23
## NOTE(review): the roles of Q21 (party), Q22 (strength) and Q23 (lean) are
## read off the recode below; confirm against the CMPS 2020 codebook.

## Tabulate the three items the recode actually uses. The earlier
## diagnostic grouped by Q22/Q23/Q24, which does not show Q21.
diag_df <- cmps_2020 %>%
  group_by(Q21,
           Q22,
           Q23) %>%
  reframe(count = n())

## Conditions are tried top to bottom, so in both recodes a Republican
## condition wins over a Democratic one when both hold.
cmps_2020 <- cmps_2020 %>%
  mutate(
    Party = case_when(
      Q21 == 1 | Q23 == 1 ~ "Republican",
      Q21 == 2 | Q23 == 2 ~ "Democrat",
      Q21 == 3 ~ "Independent",
      TRUE ~ "Other"),
    Party7 = case_when(
      Q21 == 1 & Q22 == 1 ~ "Strong Republican",
      Q21 == 1 & Q22 == 2 ~ "Weak Republican",
      Q21 == 3 & Q23 == 1 ~ "Lean Republican",
      Q23 == 3 ~ "Independent",
      Q23 == 2 ~ "Lean Democrat",
      Q21 == 2 & Q22 == 2 ~ "Weak Democrat",
      Q21 == 2 & Q22 == 1 ~ "Strong Democrat",
      TRUE ~ "Other"
    )
  )

## Check that the two recodes line up with each other
diag2_df <- cmps_2020 %>%
  group_by(Party,
           Party7) %>%
  reframe(count = n())
  

## 14. date. Leave NA for now


## Stored as the literal string "NA", not as a missing value
cmps_2020 <- mutate(cmps_2020, date = "NA")


## 15. birthyear = birthyr

## Made previously

## 16. education: S13

diag_df <- reframe(group_by(cmps_2020, S13), count = n())

## Just has to be CollGrad binary: 1 when S13 is 5, 6 or 7, otherwise 0
## (a missing S13 also gives 0, since %in% never returns NA).
## NOTE(review): this is numeric 0/1, while the other chunks store text
## labels ("CollGrad", "HSOnly", ...) in `education`.

cmps_2020 <- mutate(cmps_2020, education = as.numeric(S13 %in% c(5, 6, 7)))


## 17. origin

diag_df <- reframe(group_by(cmps_2020, S10), count = n())


## origin from S10 (12 Mexican, 17 Puerto Rican, 6 Cuban, 7 Dominican,
## 9 Salvadoran). S10_Mex == 1 also counts as Mexican and is checked first,
## so it wins over any other S10 code. Everything else is "Other".
cmps_2020 <- mutate(
  cmps_2020,
  origin = case_when(
    S10 == 12 | S10_Mex == 1 ~ "Mexican",
    S10 == 17 ~ "Puerto Rican",
    S10 == 6 ~ "Cuban",
    S10 == 7 ~ "Dominican",
    S10 == 9 ~ "Salvadoran",
    TRUE ~ "Other"
  )
)

## origin and Mexican use the same indicators; tabulate them together
diag_df <- reframe(group_by(cmps_2020, origin, Mexican), count = n())


## 18. Weight_AllLatinos = NA for now
## 19. region = NA for now (done in next data step)

cmps_2020 <- mutate(cmps_2020,
                    Weight_AllLatinos = NA,
                    region = NA)


## 20: generation

## case_when() uses the first condition that is TRUE, so each upper bound on
## its own delimits the cohort. A missing birthyear, or one >= 2020, matches
## nothing and falls through to "Missing".
cmps_2020 <- mutate(
  cmps_2020,
  generation = case_when(
    birthyear < 1928 ~ "Greatest",
    birthyear < 1946 ~ "Silent",
    birthyear < 1965 ~ "Boomer",
    birthyear < 1981 ~ "GenX",
    birthyear < 1997 ~ "Millennial",
    birthyear < 2020 ~ "GenZ",
    TRUE ~ "Missing"
  )
)

## Cohort assigned to each birth year
diag_df <- reframe(group_by(cmps_2020, generation, birthyear), count = n())


## Merge in

## Final selection

## Variables to get:

names(df1)

#  [1] "Year"              "Survey"            "id"                "Weight"            "state"             "sex"              
#  [7] "age"               "Mexican"           "USborn"            "USborn_parents"    "USborn_gparents"   "Party"            
# [13] "Party7"            "date"              "birthyear"         "education"         "origin"            "Weight_AllLatinos"
# [19] "region"            "generation"
# NOTE(review): the listing above shows "generation" and no "citizen", while
# the select() below keeps "citizen" and drops "generation" -- confirm the
# listing is still current.

## Keep only the harmonised columns
cmps_2020 <- select(
  cmps_2020,
  Year, Survey, id, Weight, Weight_AllLatinos, state, region, sex, age,
  Mexican, USborn, USborn_parents, USborn_gparents, Party, Party7, date,
  birthyear, education, origin, citizen
)

## Position-by-position comparison of column names (printed for inspection)
names(df_collected) == names(cmps_2020)

## Append this wave to the running combined data set
df_collected <- rbind(df_collected, cmps_2020)

```




```{r Final data checks}

names(df_collected)


## PID check: every Party / Party7 combination present in the combined file

diag_df <- reframe(group_by(df_collected, Party, Party7), count = n())

## National origin check: origin labels per survey-year

diag_df <- reframe(group_by(df_collected, Year, Survey, origin), count = n())


## Write the combined data set to the working directory
write_csv(df_collected, file = "collected_surveys_all.csv")


```

